Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Change type of "pc_cols" param in ancestry function from hl.expr.ArrayExpression to List[int] to help track PCs that were used in RF model #448

Merged
merged 6 commits into from
May 17, 2022
Merged
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 12 additions & 6 deletions gnomad/sample_qc/ancestry.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ def pc_project(
def assign_population_pcs(
pop_pca_scores: Union[hl.Table, pd.DataFrame],
pc_cols: Union[hl.expr.ArrayExpression, List[str]],
pc_names: List[int],
known_col: str = "known_pop",
fit: Any = None, # Type should be RandomForestClassifier but we do not want to import sklearn.RandomForestClassifier outside
seed: int = 42,
Expand Down Expand Up @@ -149,6 +150,7 @@ def assign_population_pcs(

:param pop_pca_scores: Input Hail Table or pandas DataFrame
:param pc_cols: Columns storing the PCs to use
:param pc_names: List of integers used to name the selected PCs, in order (e.g., an input of [1, 3] will result in the first two PCs of pc_cols being named PC1 and PC3)
klaricch marked this conversation as resolved.
Show resolved Hide resolved
:param known_col: Column storing the known population labels
:param fit: Fit from a previously trained random forest model (i.e., the output from a previous RandomForestClassifier() call)
:param seed: Random seed
Expand All @@ -172,10 +174,8 @@ def assign_population_pcs(

# Explode the PC array
num_out_cols = min([len(x) for x in pop_pc_pd["pca_scores"].values.tolist()])
klaricch marked this conversation as resolved.
Show resolved Hide resolved
pc_cols = [f"PC{i+1}" for i in range(num_out_cols)]
pop_pc_pd[pc_cols] = pd.DataFrame(pop_pc_pd["pca_scores"].values.tolist())[
list(range(num_out_cols))
]
pc_cols = [f"PC{i}" for i in pc_names]
pop_pc_pd[pc_cols] = pd.DataFrame(pop_pc_pd["pca_scores"].values.tolist())

else:
pop_pc_pd = pop_pca_scores
Expand Down Expand Up @@ -230,8 +230,14 @@ def assign_population_pcs(

if hail_input:
pops_ht = hl.Table.from_pandas(pop_pc_pd, key=list(pop_pca_scores.key))
pops_ht.annotate_globals(
assign_pops_from_pc_params=hl.struct(min_assignment_prob=min_prob)
pops_ht = pops_ht.annotate_globals(
klaricch marked this conversation as resolved.
Show resolved Hide resolved
assign_pops_from_pc_params=hl.struct(
min_assignment_prob=min_prob, error_rate=error_rate
)
)
pops_ht = pops_ht.annotate(
evaluation_sample=hl.literal(list(evaluate_fit.s)).contains(pops_ht.s),
training_sample=hl.literal(list(train_fit.s)).contains(pops_ht.s),
)
return pops_ht, pop_clf
else:
Expand Down