-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathsample_level_processing.R
161 lines (158 loc) · 9.67 KB
/
sample_level_processing.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
## ------------------------------------
##
## Script name: single-cell data sample level processing
##
## Purpose of script: Ambient RNA removal using SoupX, sample-level bad-quality cell filtering using Seurat, and doublet removal using DoubletFinder
##
## Author: Jiekun (Jackie) Yang
##
## Date Created: 2022-07-11
##
## Email: jkyang@mit.edu
##
##--------------------------------------
### cellranger_input_file is a tab delimited file with three columns: project_folder_path, flowcell_ID, and library_ID
### species takes in mouse or human
### soupx_b takes in TRUE or FALSE, if you want SoupX decontamination or not
### seurat_filt_b takes in TRUE or FALSE, if you want bad-quality cell filtering using Seurat or not
### if seurat_filt_b == TRUE, nfeature_low, nfeature_high, ncount_low, and per_mt_high takes in numbers for the lower cutoff of number of molecules/UMIs, higher cutoff of number of molecules/UMIs, lower cutoff of number of genes, and higher cutoff of percentage of mitochondrial reads
### norm_mode takes in regular (normalize to the median number of genes) or sctransform
### pc_num takes in the number of PCs to use for Seurat FindNeighbors and DoubletFinder paramSweep_v3
### clust_res take in the resolution for Seurat FindClusters
### doublet_b takes in TRUE or FALSE, if you want DoubletFinder or not
### target_folder is where you would like to store output files generated by this function
## library setup
library(SoupX)
library(Seurat)
options(future.globals.maxSize = 128000 * 1024^2)
library(future)
library(future.apply)
plan("multiprocess", workers = 16)
library(DoubletFinder)
## function
sample_level_process <- function(cellranger_input_file = NULL, species = NULL, soupx_b = TRUE, seurat_filt_b = FALSE, nfeature_low = 0, nfeature_high = Inf, ncount_low = 0, per_mt_high = 100, norm_mode = c("regular", "sctransform"), pc_num = 20, clust_res = 0.4, doublet_b = TRUE, target_folder = NULL) {
temp.pwd <- getwd()
cellranger_input <- read.table(cellranger_input_file, header = F, stringsAsFactors = F)
sample.stat <- data.frame()
setwd(target_folder)
if (soupx_b == TRUE) {
soupx_folder <- paste0(target_folder, "/soupx")
if (!dir.exists("soupx")) {
dir.create("soupx")
}
}
seurat_plot_folder <- paste0(target_folder, "/seurat_plots")
if (!dir.exists("seurat_plots")) {
dir.create("seurat_plots")
}
for (temp.row in 1:nrow(cellranger_input)) {
temp.projPATH <- cellranger_input[temp.row, 1]
temp.flowID <- cellranger_input[temp.row, 2]
temp.lib <- cellranger_input[temp.row, 3]
print(temp.lib)
temp.path <- paste0(temp.projPATH, "/", temp.flowID, "/", temp.lib, "/outs")
sample.stat[temp.row, "library_id"] <- temp.lib
if (soupx_b == TRUE) {
for (temp.mode in c("auto", "fixed_0.1", "fixed_0.15", "fixed_0.2", "manual")) {
print(paste0("SoupX:", temp.mode))
if (temp.mode == "manual") {
if (species == "mouse") {
hbGenes <- c("Hbb-bt", "Hbb-bs", "Hbb-bh2", "Hbb-bh1", "Hbb-y", "Hba-x", "Hba-a1", "Hba-a2")
igGenes <- c("Igha", "Ighe", "Ighg2c", "Ighg2b", "Ighg1", "Ighg3", "Ighd", "Ighm", "Ighj4", "Ighj3", "Ighj2", "Ighj1")
} else if (species == "human") {
hbGenes <- c("HBB-BT", "HBB-BS", "HBB-BH2", "HBB-BH1", "HBB-Y", "HBA-X", "HBA-A1", "HBA-A2")
igGenes <- c("IGHA", "IGHE", "IGHG2C", "IGHG2B", "IGHG1", "IGHG3", "IGHD", "IGHM", "IGHJ4", "IGHJ3", "IGHJ2", "IGHJ1")
}
nonExpressedGeneList <- list(HB = hbGenes, IG = igGenes)
}
temp.df <- load10X(temp.path)
sample.stat[temp.row, "cell_num_after_cellranger_count"] <- ncol(temp.df$toc)
if (temp.mode == "auto") {
temp.df <- autoEstCont(temp.df, doPlot = FALSE)
sample.stat[temp.row, "auto_contamination_rate"] <- temp.df$fit$rhoEst
} else if (grepl("fixed", temp.mode)) {
temp.cr <- as.numeric(gsub(".*_", "", temp.mode))
temp.df <- setContaminationFraction(temp.df, temp.cr)
sample.stat[temp.row, paste0(temp.mode, "_contamination_rate")] <- temp.cr
} else if (temp.mode == "manual") {
useToEst <- estimateNonExpressingCells(temp.df, nonExpressedGeneList = nonExpressedGeneList, clusters = FALSE)
# desirable to be aggresive: not using clustering info
temp.df <- calculateContaminationFraction(temp.df, nonExpressedGeneList, useToEst = useToEst)
sample.stat[temp.row, "manual_contamination_rate"] <-unique(temp.df$metaData$rho)
}
saveRDS(temp.df, paste0(soupx_folder, "/", temp.lib, "_", temp.mode, ".rds"))
temp.df.out <- adjustCounts(temp.df, roundToInt=TRUE)
}
} else if (soupx_b == FALSE) {
temp.df.out <- Read10X(data.dir = paste0(temp.path, "/filtered_feature_bc_matrix/"))
sample.stat[temp.row, "cell_num_after_cellranger_count"] <- ncol(temp.df.out)
}
print("Seurat")
colnames(temp.df.out) <- paste0(temp.lib, "_", colnames(temp.df.out))
temp.data <- CreateSeuratObject(counts = temp.df.out)
if (species == "mouse") {
temp.data[["percent.mt"]] <- PercentageFeatureSet(temp.data, pattern = "^mt-")
} else if (species == "human") {
temp.data[["percent.mt"]] <- PercentageFeatureSet(temp.data, pattern = "^MT-")
}
if (seurat_filt_b == TRUE) {
sample.stat[temp.row, "nFeature_RNA_low"] <- nfeature_low
sample.stat[temp.row, "nFeature_RNA_high"] <- nfeature_high
sample.stat[temp.row, "nCount_RNA_low"] <- ncount_low
sample.stat[temp.row, "percent_MT_high"] <- per_mt_high
temp.data <- subset(temp.data, subset = nFeature_RNA > nfeature_low & nFeature_RNA < nfeature_high & nCount_RNA > ncount_low & percent.mt < per_mt_high)
sample.stat[temp.row, "cell_num_after_seurat_filt_count"] <- ncol(temp.data)
}
sample.stat[temp.row, "normalization_method"] <- norm_mode
if (norm_mode == "regular") {
temp.data <- NormalizeData(temp.data, scale.factor = median(temp.data$nCount_RNA))
temp.data <- ScaleData(temp.data, verbose = FALSE)
} else if (norm_mode == "sctransform") {
temp.data <- SCTransform(temp.data, verbose = FALSE)
}
png(paste0(seurat_plot_folder, "/", temp.lib, "_violin.png"), width = 600, height = 300)
print(VlnPlot(temp.data, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3))
dev.off()
png(paste0(seurat_plot_folder, "/", temp.lib, "_scatter.png"), width = 600, height = 300)
plot1 <- FeatureScatter(temp.data, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(temp.data, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
print(plot1 + plot2)
dev.off()
temp.data <- FindVariableFeatures(temp.data)
temp.data <- RunPCA(temp.data, features = VariableFeatures(temp.data)) # default 50 pcs
png(paste0(seurat_plot_folder, "/", temp.lib, "_elbow.png"), width = 300, height = 300)
print(ElbowPlot(temp.data))
dev.off()
temp.data <- RunUMAP(temp.data, reduction = "pca", dims = 1:pc_num)
sample.stat[temp.row, "number_of_PCs_used"] <- pc_num
temp.data <- FindNeighbors(temp.data, reduction = "pca", dims = 1:pc_num)
sample.stat[temp.row, "clustering_resolution"] <- clust_res
temp.data <- FindClusters(temp.data, resolution = clust_res)
if (doublet_b == TRUE) {
print("DoubletFinder")
## pK Identification (no ground-truth)
if (norm_mode == "sctransform") {
temp.sweep.res.list <- paramSweep_v3(temp.data, PCs = 1:pc_num, sct = TRUE)
} else {
temp.sweep.res.list <- paramSweep_v3(temp.data, PCs = 1:pc_num)
}
temp.sweep.stats <- summarizeSweep(temp.sweep.res.list, GT = FALSE)
temp.bcmvn <- find.pK(temp.sweep.stats)
temp.pK_value <- as.numeric(as.character(temp.bcmvn$pK[temp.bcmvn$BCmetric == max(temp.bcmvn$BCmetric)]))
## Homotypic Doublet Proportion Estimate
temp.homotypic.prop <- modelHomotypic(temp.data@meta.data[, grepl("snn_res", colnames(temp.data@meta.data))])
temp.nExp_poi <- round(0.031*ncol(temp.data))
temp.nExp_poi.adj <- round(temp.nExp_poi*(1-temp.homotypic.prop))
## Run DoubletFinder with varying classification stringencies
temp.pN_value <- 0.25
temp.pANN_value <- paste0("pANN_",temp.pN_value,"_",temp.pK_value,'_',temp.nExp_poi)
temp.data <- doubletFinder_v3(temp.data, PCs = 1:pc_num, pN = temp.pN_value, pK = temp.pK_value, nExp = temp.nExp_poi, reuse.pANN = FALSE, sct = TRUE)
temp.data <- doubletFinder_v3(temp.data, PCs = 1:pc_num, pN = temp.pN_value, pK = temp.pK_value, nExp = temp.nExp_poi.adj, reuse.pANN = temp.pANN_value, sct = TRUE)
temp.data$Doublet <- temp.data@meta.data[,paste0("DF.classifications_",temp.pN_value,"_",temp.pK_value,'_',temp.nExp_poi.adj)]
sample.stat[temp.row, "cell_num_after_df"] <- summary(as.factor(temp.data@meta.data[,paste0("DF.classifications_",temp.pN_value,"_",temp.pK_value,'_',temp.nExp_poi.adj)]))[2]
}
saveRDS(temp.data, paste0(temp.lib, "_seurat.rds"))
}
write.table(sample.stat, paste0("sample_qc_metrics.txt"), sep = "\t", col.names = T, row.names = F, quote = F)
setwd(temp.pwd)
}