sample_level_processing.R

## ------------------------------------
##
## Script name: single-cell data sample level processing
##
## Purpose of script: Ambient RNA removal using SoupX, sample-level bad-quality cell filtering using Seurat, and doublet removal using DoubletFinder
##
## Author: Jiekun (Jackie) Yang
##
## Date Created: 2022-07-11
##
## Email: jkyang@mit.edu
##
##--------------------------------------

### cellranger_input_file is a tab delimited file with three columns: project_folder_path, flowcell_ID, and library_ID
### species takes in mouse or human
### soupx_b takes in TRUE or FALSE, if you want SoupX decontamination or not
### seurat_filt_b takes in TRUE or FALSE, if you want bad-quality cell filtering using Seurat or not
### if seurat_filt_b == TRUE, nfeature_low, nfeature_high, ncount_low, and per_mt_high takes in numbers for the lower cutoff of number of molecules/UMIs, higher cutoff of number of molecules/UMIs, lower cutoff of number of genes, and higher cutoff of percentage of mitochondrial reads
### norm_mode takes in regular (normalize to the median number of genes) or sctransform
### pc_num takes in the number of PCs to use for Seurat FindNeighbors and DoubletFinder paramSweep_v3
### clust_res take in the resolution for Seurat FindClusters
### doublet_b takes in TRUE or FALSE, if you want DoubletFinder or not
### target_folder is where you would like to store output files generated by this function

## library setup
library(SoupX)
library(Seurat)
options(future.globals.maxSize = 128000 * 1024^2)
library(future)
library(future.apply)
plan("multiprocess", workers = 16)
library(DoubletFinder)

## function
sample_level_process <- function(cellranger_input_file = NULL, species = NULL, soupx_b = TRUE, seurat_filt_b = FALSE, nfeature_low = 0, nfeature_high = Inf, ncount_low = 0, per_mt_high = 100, norm_mode = c("regular", "sctransform"), pc_num = 20, clust_res = 0.4, doublet_b = TRUE, target_folder = NULL) {
    temp.pwd <- getwd()
    cellranger_input <- read.table(cellranger_input_file, header = F, stringsAsFactors = F)
    sample.stat <- data.frame()
    setwd(target_folder)
    if (soupx_b == TRUE) {
        soupx_folder <- paste0(target_folder, "/soupx")
        if (!dir.exists("soupx")) {
            dir.create("soupx")
        }
    }
    seurat_plot_folder <- paste0(target_folder, "/seurat_plots")
    if (!dir.exists("seurat_plots")) {
        dir.create("seurat_plots")
    }
    for (temp.row in 1:nrow(cellranger_input)) {
        temp.projPATH <- cellranger_input[temp.row, 1]
        temp.flowID <- cellranger_input[temp.row, 2]
        temp.lib <- cellranger_input[temp.row, 3]
        print(temp.lib)
        temp.path <- paste0(temp.projPATH, "/", temp.flowID, "/", temp.lib, "/outs")
        sample.stat[temp.row, "library_id"] <- temp.lib
        if (soupx_b == TRUE) {
            for (temp.mode in c("auto", "fixed_0.1", "fixed_0.15", "fixed_0.2", "manual")) {
                print(paste0("SoupX:", temp.mode))
                if (temp.mode == "manual") {
                    if (species == "mouse") {
                        hbGenes <- c("Hbb-bt", "Hbb-bs", "Hbb-bh2", "Hbb-bh1", "Hbb-y", "Hba-x", "Hba-a1", "Hba-a2")
                        igGenes <- c("Igha", "Ighe", "Ighg2c", "Ighg2b", "Ighg1", "Ighg3", "Ighd", "Ighm", "Ighj4", "Ighj3", "Ighj2", "Ighj1")
                    } else if (species == "human") {
                        hbGenes <- c("HBB-BT", "HBB-BS", "HBB-BH2", "HBB-BH1", "HBB-Y", "HBA-X", "HBA-A1", "HBA-A2")
                        igGenes <- c("IGHA", "IGHE", "IGHG2C", "IGHG2B", "IGHG1", "IGHG3", "IGHD", "IGHM", "IGHJ4", "IGHJ3", "IGHJ2", "IGHJ1")
                    }
                nonExpressedGeneList <- list(HB = hbGenes, IG = igGenes)
                }
                temp.df <- load10X(temp.path)
                sample.stat[temp.row, "cell_num_after_cellranger_count"] <- ncol(temp.df$toc)
                if (temp.mode == "auto") {
                    temp.df <- autoEstCont(temp.df, doPlot = FALSE)
                    sample.stat[temp.row, "auto_contamination_rate"] <- temp.df$fit$rhoEst
                } else if (grepl("fixed", temp.mode)) {
                    temp.cr <- as.numeric(gsub(".*_", "", temp.mode))
                    temp.df <- setContaminationFraction(temp.df, temp.cr)
                    sample.stat[temp.row, paste0(temp.mode, "_contamination_rate")] <- temp.cr
                } else if (temp.mode == "manual") {
                    useToEst <- estimateNonExpressingCells(temp.df, nonExpressedGeneList = nonExpressedGeneList, clusters = FALSE)
                    # desirable to be aggresive: not using clustering info
                    temp.df <- calculateContaminationFraction(temp.df, nonExpressedGeneList, useToEst = useToEst)
                    sample.stat[temp.row, "manual_contamination_rate"] <-unique(temp.df$metaData$rho)
                }
                saveRDS(temp.df, paste0(soupx_folder, "/", temp.lib, "_", temp.mode, ".rds"))
                temp.df.out <- adjustCounts(temp.df, roundToInt=TRUE)
            }
        } else if (soupx_b == FALSE) {
            temp.df.out <- Read10X(data.dir = paste0(temp.path, "/filtered_feature_bc_matrix/"))
            sample.stat[temp.row, "cell_num_after_cellranger_count"] <- ncol(temp.df.out)
        }
        print("Seurat")
        colnames(temp.df.out) <- paste0(temp.lib, "_", colnames(temp.df.out))
        temp.data <- CreateSeuratObject(counts = temp.df.out)
        if (species == "mouse") {
            temp.data[["percent.mt"]] <- PercentageFeatureSet(temp.data, pattern = "^mt-")
        } else if (species == "human") {
            temp.data[["percent.mt"]] <- PercentageFeatureSet(temp.data, pattern = "^MT-")
        }
        if (seurat_filt_b == TRUE) {
            sample.stat[temp.row, "nFeature_RNA_low"] <- nfeature_low
            sample.stat[temp.row, "nFeature_RNA_high"] <- nfeature_high
            sample.stat[temp.row, "nCount_RNA_low"] <- ncount_low
            sample.stat[temp.row, "percent_MT_high"] <- per_mt_high
            temp.data <- subset(temp.data, subset = nFeature_RNA > nfeature_low & nFeature_RNA < nfeature_high & nCount_RNA > ncount_low & percent.mt < per_mt_high)
            sample.stat[temp.row, "cell_num_after_seurat_filt_count"] <- ncol(temp.data)
        }
        sample.stat[temp.row, "normalization_method"] <- norm_mode
        if (norm_mode == "regular") {
            temp.data <- NormalizeData(temp.data, scale.factor = median(temp.data$nCount_RNA))
            temp.data <- ScaleData(temp.data, verbose = FALSE)
        } else if (norm_mode == "sctransform") {
            temp.data <- SCTransform(temp.data, verbose = FALSE)
        }
        png(paste0(seurat_plot_folder, "/", temp.lib, "_violin.png"), width = 600, height = 300)
        print(VlnPlot(temp.data, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3))
        dev.off()
        png(paste0(seurat_plot_folder, "/", temp.lib, "_scatter.png"), width = 600, height = 300)
        plot1 <- FeatureScatter(temp.data, feature1 = "nCount_RNA", feature2 = "percent.mt")
        plot2 <- FeatureScatter(temp.data, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
        print(plot1 + plot2)
        dev.off()
        temp.data <- FindVariableFeatures(temp.data)
        temp.data <- RunPCA(temp.data, features = VariableFeatures(temp.data)) # default 50 pcs
        png(paste0(seurat_plot_folder, "/", temp.lib, "_elbow.png"), width = 300, height = 300)
        print(ElbowPlot(temp.data))
        dev.off()
        temp.data <- RunUMAP(temp.data, reduction = "pca", dims = 1:pc_num)
        sample.stat[temp.row, "number_of_PCs_used"] <- pc_num
        temp.data <- FindNeighbors(temp.data, reduction = "pca", dims = 1:pc_num)
        sample.stat[temp.row, "clustering_resolution"] <- clust_res
        temp.data <- FindClusters(temp.data, resolution = clust_res)
        if (doublet_b == TRUE) {
            print("DoubletFinder")
            ## pK Identification (no ground-truth)
            if (norm_mode == "sctransform") {
                temp.sweep.res.list <- paramSweep_v3(temp.data, PCs = 1:pc_num, sct = TRUE)
            } else {
                temp.sweep.res.list <- paramSweep_v3(temp.data, PCs = 1:pc_num)
            }
            temp.sweep.stats <- summarizeSweep(temp.sweep.res.list, GT = FALSE)
            temp.bcmvn <- find.pK(temp.sweep.stats)
            temp.pK_value <- as.numeric(as.character(temp.bcmvn$pK[temp.bcmvn$BCmetric == max(temp.bcmvn$BCmetric)]))
            ## Homotypic Doublet Proportion Estimate
            temp.homotypic.prop <- modelHomotypic(temp.data@meta.data[, grepl("snn_res", colnames(temp.data@meta.data))])
            temp.nExp_poi <- round(0.031*ncol(temp.data))
            temp.nExp_poi.adj <- round(temp.nExp_poi*(1-temp.homotypic.prop))
            ## Run DoubletFinder with varying classification stringencies
            temp.pN_value <- 0.25
            temp.pANN_value <- paste0("pANN_",temp.pN_value,"_",temp.pK_value,'_',temp.nExp_poi)
            temp.data <- doubletFinder_v3(temp.data, PCs = 1:pc_num, pN = temp.pN_value, pK = temp.pK_value, nExp = temp.nExp_poi, reuse.pANN = FALSE, sct = TRUE)
            temp.data <- doubletFinder_v3(temp.data, PCs = 1:pc_num, pN = temp.pN_value, pK = temp.pK_value, nExp = temp.nExp_poi.adj, reuse.pANN = temp.pANN_value, sct = TRUE)
            temp.data$Doublet <- temp.data@meta.data[,paste0("DF.classifications_",temp.pN_value,"_",temp.pK_value,'_',temp.nExp_poi.adj)]
            sample.stat[temp.row, "cell_num_after_df"] <- summary(as.factor(temp.data@meta.data[,paste0("DF.classifications_",temp.pN_value,"_",temp.pK_value,'_',temp.nExp_poi.adj)]))[2]
        }
        saveRDS(temp.data, paste0(temp.lib, "_seurat.rds"))
    }
    write.table(sample.stat, paste0("sample_qc_metrics.txt"), sep = "\t", col.names = T, row.names = F, quote = F)
    setwd(temp.pwd)
}