From cf6d9f8190363bf63eb0908d798915294de2749d Mon Sep 17 00:00:00 2001
From: klaricch <kristen@broadinstitute.org>
Date: Wed, 4 May 2022 11:31:03 -0400
Subject: [PATCH 1/6] add pc_names param

---
 gnomad/sample_qc/ancestry.py | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/gnomad/sample_qc/ancestry.py b/gnomad/sample_qc/ancestry.py
index 0c7a2e9c4..f4f8813d5 100644
--- a/gnomad/sample_qc/ancestry.py
+++ b/gnomad/sample_qc/ancestry.py
@@ -118,6 +118,7 @@ def pc_project(
 def assign_population_pcs(
     pop_pca_scores: Union[hl.Table, pd.DataFrame],
     pc_cols: Union[hl.expr.ArrayExpression, List[str]],
+    pc_names: List[int],
     known_col: str = "known_pop",
     fit: Any = None,  # Type should be RandomForestClassifier but we do not want to import sklearn.RandomForestClassifier outside
     seed: int = 42,
@@ -149,6 +150,7 @@ def assign_population_pcs(
 
     :param pop_pca_scores: Input Hail Table or Pandas Dataframe
     :param pc_cols: Columns storing the PCs to use
+    :param pc_names: List fo integer to use for naming the selected PCs (i.e. an input of [1, 3] will result in the first two PCs of pc_cols being named PC1 and PC3)
     :param known_col: Column storing the known population labels
     :param fit: Fit from a previously trained random forest model (i.e., the output from a previous RandomForestClassifier() call)
     :param seed: Random seed
@@ -172,10 +174,8 @@ def assign_population_pcs(
 
         # Explode the PC array
         num_out_cols = min([len(x) for x in pop_pc_pd["pca_scores"].values.tolist()])
-        pc_cols = [f"PC{i+1}" for i in range(num_out_cols)]
-        pop_pc_pd[pc_cols] = pd.DataFrame(pop_pc_pd["pca_scores"].values.tolist())[
-            list(range(num_out_cols))
-        ]
+        pc_cols = [f"PC{i}" for i in pc_names]
+        pop_pc_pd[pc_cols] = pd.DataFrame(pop_pc_pd["pca_scores"].values.tolist())
 
     else:
         pop_pc_pd = pop_pca_scores
@@ -230,8 +230,14 @@ def assign_population_pcs(
 
     if hail_input:
         pops_ht = hl.Table.from_pandas(pop_pc_pd, key=list(pop_pca_scores.key))
-        pops_ht.annotate_globals(
-            assign_pops_from_pc_params=hl.struct(min_assignment_prob=min_prob)
+        pops_ht = pops_ht.annotate_globals(
+            assign_pops_from_pc_params=hl.struct(
+                min_assignment_prob=min_prob, error_rate=error_rate
+            )
+        )
+        pops_ht = pops_ht.annotate(
+            evaluation_sample=hl.literal(list(evaluate_fit.s)).contains(pops_ht.s),
+            training_sample=hl.literal(list(train_fit.s)).contains(pops_ht.s),
         )
         return pops_ht, pop_clf
     else:

From d5714339b7ef264cc994f5925839c2d27922899f Mon Sep 17 00:00:00 2001
From: klaricch <kristen@broadinstitute.org>
Date: Wed, 4 May 2022 11:37:07 -0400
Subject: [PATCH 2/6] small edit

---
 gnomad/sample_qc/ancestry.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gnomad/sample_qc/ancestry.py b/gnomad/sample_qc/ancestry.py
index f4f8813d5..d4772cf4b 100644
--- a/gnomad/sample_qc/ancestry.py
+++ b/gnomad/sample_qc/ancestry.py
@@ -150,7 +150,7 @@ def assign_population_pcs(
 
     :param pop_pca_scores: Input Hail Table or Pandas Dataframe
     :param pc_cols: Columns storing the PCs to use
-    :param pc_names: List fo integer to use for naming the selected PCs (i.e. an input of [1, 3] will result in the first two PCs of pc_cols being named PC1 and PC3)
+    :param pc_names: List of integers to use for naming the selected PCs (i.e. an input of [1, 3] will result in the first two PCs of pc_cols being named PC1 and PC3)
     :param known_col: Column storing the known population labels
     :param fit: Fit from a previously trained random forest model (i.e., the output from a previous RandomForestClassifier() call)
     :param seed: Random seed

From 7a6819af30ac0d041bf7a620ead4a54d01f6db3f Mon Sep 17 00:00:00 2001
From: klaricch <kristen@broadinstitute.org>
Date: Wed, 4 May 2022 17:23:14 -0400
Subject: [PATCH 3/6] edit pc_cols param

---
 gnomad/sample_qc/ancestry.py | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/gnomad/sample_qc/ancestry.py b/gnomad/sample_qc/ancestry.py
index d4772cf4b..72cbcbe08 100644
--- a/gnomad/sample_qc/ancestry.py
+++ b/gnomad/sample_qc/ancestry.py
@@ -117,8 +117,7 @@ def pc_project(
 
 def assign_population_pcs(
     pop_pca_scores: Union[hl.Table, pd.DataFrame],
-    pc_cols: Union[hl.expr.ArrayExpression, List[str]],
-    pc_names: List[int],
+    pc_cols: Union[List[int], List[str]],
     known_col: str = "known_pop",
     fit: Any = None,  # Type should be RandomForestClassifier but we do not want to import sklearn.RandomForestClassifier outside
     seed: int = 42,
@@ -137,7 +136,7 @@ def assign_population_pcs(
 
     As input, this function can either take:
         - A Hail Table (typically the output of `hwe_normalized_pca`). In this case,
-            - `pc_cols` should be an ArrayExpression of Floats where each element is one of the PCs to use.
+            - `pc_cols` should be a list of integers where each element is one of the PCs to use.
             - A Hail Table will be returned as output
         - A Pandas DataFrame. In this case:
             - Each PC should be in a separate column and `pc_cols` is the list of all the columns containing the PCs to use.
@@ -149,8 +148,7 @@ def assign_population_pcs(
         can be used to expand this column into multiple `PC` columns.
 
     :param pop_pca_scores: Input Hail Table or Pandas Dataframe
-    :param pc_cols: Columns storing the PCs to use
-    :param pc_names: List of integers to use for naming the selected PCs (i.e. an input of [1, 3] will result in the first two PCs of pc_cols being named PC1 and PC3)
+    :param pc_cols: List of which PCS to use/columns storing the PCs to use (i.e. [1,2,4,5])
     :param known_col: Column storing the known population labels
     :param fit: Fit from a previously trained random forest model (i.e., the output from a previous RandomForestClassifier() call)
     :param seed: Random seed
@@ -165,16 +163,16 @@ def assign_population_pcs(
 
     hail_input = isinstance(pop_pca_scores, hl.Table)
     if hail_input:
+        pcs_to_pull = [pop_pca_scores.scores[i - 1] for i in pc_cols]
         if not fit:
-            pop_pca_scores = pop_pca_scores.select(known_col, pca_scores=pc_cols)
+            pop_pca_scores = pop_pca_scores.select(known_col, pca_scores=pcs_to_pull)
         else:
-            pop_pca_scores = pop_pca_scores.select(pca_scores=pc_cols)
+            pop_pca_scores = pop_pca_scores.select(pca_scores=pcs_to_pull)
 
         pop_pc_pd = pop_pca_scores.to_pandas()
 
         # Explode the PC array
-        num_out_cols = min([len(x) for x in pop_pc_pd["pca_scores"].values.tolist()])
-        pc_cols = [f"PC{i}" for i in pc_names]
+        pc_cols = [f"PC{i}" for i in pc_cols]
         pop_pc_pd[pc_cols] = pd.DataFrame(pop_pc_pd["pca_scores"].values.tolist())
 
     else:
@@ -295,4 +293,4 @@ def run_pca_with_relateds(
         )
         related_scores = pc_project(related_mt, pca_loadings)
         pca_scores = pca_scores.union(related_scores)
-        return pca_evals, pca_scores, pca_loadings
+        return pca_evals, pca_scores, pca_loadings
\ No newline at end of file

From da157dca7105ee1a83adf74b7ac87c4672bf6001 Mon Sep 17 00:00:00 2001
From: klaricch <kristen@broadinstitute.org>
Date: Wed, 4 May 2022 18:20:25 -0400
Subject: [PATCH 4/6] run black

---
 gnomad/sample_qc/ancestry.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gnomad/sample_qc/ancestry.py b/gnomad/sample_qc/ancestry.py
index 72cbcbe08..cad8d2778 100644
--- a/gnomad/sample_qc/ancestry.py
+++ b/gnomad/sample_qc/ancestry.py
@@ -293,4 +293,4 @@ def run_pca_with_relateds(
         )
         related_scores = pc_project(related_mt, pca_loadings)
         pca_scores = pca_scores.union(related_scores)
-        return pca_evals, pca_scores, pca_loadings
\ No newline at end of file
+        return pca_evals, pca_scores, pca_loadings

From d1ae2cf288a533b15a271a52c3fa60b7f314f499 Mon Sep 17 00:00:00 2001
From: klaricch <kristen@broadinstitute.org>
Date: Tue, 17 May 2022 13:52:25 -0400
Subject: [PATCH 5/6] PR suggestions

---
 gnomad/sample_qc/ancestry.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/gnomad/sample_qc/ancestry.py b/gnomad/sample_qc/ancestry.py
index cad8d2778..729e01aa4 100644
--- a/gnomad/sample_qc/ancestry.py
+++ b/gnomad/sample_qc/ancestry.py
@@ -148,7 +148,7 @@ def assign_population_pcs(
         can be used to expand this column into multiple `PC` columns.
 
     :param pop_pca_scores: Input Hail Table or Pandas Dataframe
-    :param pc_cols: List of which PCS to use/columns storing the PCs to use (i.e. [1,2,4,5])
+    :param pc_cols: List of which PCs to use/columns storing the PCs to use. When passing in a Hail Table, this should be a list of 1-based integer PC indices (e.g. [1, 2, 4, 5]); when passing in a Pandas DataFrame, this should be a list of column-name strings (e.g. ["PC1", "PC2", "PC4", "PC5"]).
     :param known_col: Column storing the known population labels
     :param fit: Fit from a previously trained random forest model (i.e., the output from a previous RandomForestClassifier() call)
     :param seed: Random seed
@@ -163,6 +163,10 @@ def assign_population_pcs(
 
     hail_input = isinstance(pop_pca_scores, hl.Table)
     if hail_input:
+        if not all(isinstance(n, int) for n in pc_cols):
+            raise TypeError(
+                "Using a Hail Table with pc_cols requires all values of pc_cols list to be integers"
+            )
         pcs_to_pull = [pop_pca_scores.scores[i - 1] for i in pc_cols]
         if not fit:
             pop_pca_scores = pop_pca_scores.select(known_col, pca_scores=pcs_to_pull)

From 8d4124860fa5a2f4481fae6300fcffe4cc07f7c2 Mon Sep 17 00:00:00 2001
From: klaricch <kristen@broadinstitute.org>
Date: Tue, 17 May 2022 13:55:17 -0400
Subject: [PATCH 6/6] add type check

---
 gnomad/sample_qc/ancestry.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/gnomad/sample_qc/ancestry.py b/gnomad/sample_qc/ancestry.py
index 729e01aa4..e7cf08384 100644
--- a/gnomad/sample_qc/ancestry.py
+++ b/gnomad/sample_qc/ancestry.py
@@ -165,7 +165,7 @@ def assign_population_pcs(
     if hail_input:
         if not all(isinstance(n, int) for n in pc_cols):
             raise TypeError(
-                "Using a Hail Table with pc_cols requires all values of pc_cols list to be integers"
+                "Using a Hail Table with pc_cols requires all values of the pc_cols list to be integers"
             )
         pcs_to_pull = [pop_pca_scores.scores[i - 1] for i in pc_cols]
         if not fit:
@@ -180,6 +180,10 @@ def assign_population_pcs(
         pop_pc_pd[pc_cols] = pd.DataFrame(pop_pc_pd["pca_scores"].values.tolist())
 
     else:
+        if not all(isinstance(n, str) for n in pc_cols):
+            raise TypeError(
+                "Using a Pandas DataFrame with pc_cols requires all values of the pc_cols list to be strings"
+            )
         pop_pc_pd = pop_pca_scores
 
     # Split training data into subsamples for fitting and evaluating