#!/bin/bash
#
# run_ensembleCNV_template.sh

## a template of step-by-step scripts to run ensembleCNV

## NOTE:
## placeholders enclosed in "<>" must be replaced with values specific to your project
##

## Follow the instructions to install ensembleCNV, iPattern, PennCNV and QuantiSNP,
## then set the installation folders below.
ENSEMBLECNV=</path/to/ensembleCNV>
IPNBASE=</path/to/iPattern>
PENNCNV=</path/to/PennCNV>
QUANTISNP=</path/to/QuantiSNP>

## project name used in running iPattern
PROJECT_NAME=<project_name>

## working directory for a new project
WKDIR=</path/to/working_directory>

## create a new project
## The project is only created when changing into the ensembleCNV folder
## succeeds, so that create_new_project.sh is never looked up in (or run
## from) an unrelated directory.
if cd "${ENSEMBLECNV}"; then
  chmod +x create_new_project.sh

  ./create_new_project.sh "${WKDIR}"
fi

## put input data in ${WKDIR}/data

## 1 Initial call =============================================================

### Prepare chromosome-wise LRR and BAF matrices for CNV genotyping -----------

#### (1) Create LRR and BAF (tab delimited) matrices from final report
## arguments: <final report> <output directory>
perl "${WKDIR}/01_initial_call/finalreport_to_matrix_LRR_and_BAF/finalreport_to_matrix_LRR_and_BAF.pl" \
  "${WKDIR}/data/final_report.txt" \
  "${WKDIR}/01_initial_call/finalreport_to_matrix_LRR_and_BAF"

#### (2) Transform tab-delimited text file to .rds format for quick loading in R
Rscript "${WKDIR}/01_initial_call/finalreport_to_matrix_LRR_and_BAF/transform_from_tab_to_rds.R" \
  --input "${WKDIR}/01_initial_call/finalreport_to_matrix_LRR_and_BAF" \
  --output "${WKDIR}/01_initial_call/finalreport_to_matrix_LRR_and_BAF/RDS"

### Prepare data for individual CNV callers -----------------------------------
## Each converter reads the same final report and writes its output using the
## given -prefix (output folder) and -suffix (file extension).

#### iPattern
perl "${WKDIR}/01_initial_call/prepare_IPQ_input_file/finalreport_to_iPattern.pl" \
  -prefix "${WKDIR}/01_initial_call/run_iPattern/data/" \
  -suffix .txt \
  "${WKDIR}/data/final_report.txt"

#### PennCNV
perl "${WKDIR}/01_initial_call/prepare_IPQ_input_file/finalreport_to_PennCNV.pl" \
  -prefix "${WKDIR}/01_initial_call/run_PennCNV/data/" \
  -suffix .txt \
  "${WKDIR}/data/final_report.txt"

#### QuantiSNP
perl "${WKDIR}/01_initial_call/prepare_IPQ_input_file/finalreport_to_QuantiSNP.pl" \
  -prefix "${WKDIR}/01_initial_call/run_QuantiSNP/data/" \
  -suffix .txt \
  "${WKDIR}/data/final_report.txt"


## run_iPattern ---------------------------------------------------------------
## Uses IPNBASE and PROJECT_NAME; the two lines below are reminders only.
#IPNBASE=</path/to/iPattern>
#PROJECT_NAME=<project_name>

#### (1) Prepare auxiliary input files
Rscript "${WKDIR}/01_initial_call/run_iPattern/prepare_input_files_for_iPattern.R" \
  "${WKDIR}" \
  "${PROJECT_NAME}"

## Note:
## `${WKDIR}/01_initial_call/run_iPattern/data_aux/${PROJECT_NAME}_bad_samples.txt`:
## We prepared an empty file where the users can type in the sample IDs to be excluded from the analysis
## if there are any.

#### (2) Run iPattern
"${IPNBASE}/ipn_0.581/preprocess/ilmn/ilmn_run.py" \
  --data-file-list   "${WKDIR}/01_initial_call/run_iPattern/data_aux/${PROJECT_NAME}_data_file.txt" \
  --gender-file      "${WKDIR}/01_initial_call/run_iPattern/data_aux/${PROJECT_NAME}_gender_file.txt" \
  --bad-sample-file  "${WKDIR}/01_initial_call/run_iPattern/data_aux/${PROJECT_NAME}_bad_samples.txt" \
  --experiment       "${PROJECT_NAME}" \
  --output-directory "${WKDIR}/01_initial_call/run_iPattern/results/" \
  --do-log \
  --do-cleanup \
  --noqsub


## run_PennCNV ----------------------------------------------------------------
## Uses PENNCNV; the line below is a reminder only.
#PENNCNV=</path/to/penncnv>

#### (1) Prepare SNP.pfb and SNP.gcmodel files
#### (1.1) compile pfb (population frequency of B allele) file
perl "${PENNCNV}/bin/compile_pfb.pl" \
  -snpposfile "${WKDIR}/01_initial_call/finalreport_to_matrix_LRR_and_BAF/SNP_pos.txt" \
  -listfile "${WKDIR}/01_initial_call/run_PennCNV/data_aux/list_pfb.txt" \
  -output "${WKDIR}/01_initial_call/run_PennCNV/data_aux/SNP.pfb"

## NOTE:
## "list_pfb.txt": the users need to prepare a text file that contains a list
## of full paths to signal files in "${WKDIR}/01_initial_call/run_PennCNV/data"
## generated by "finalreport_to_PennCNV.pl" in the initial step, one per line.
## The pfb file compiled from only a few samples is not valid -- at least about
## 100 samples (e.g. one 96-well plate of samples) are needed. Based on our
## experience, if the sample size of the project is very large, the users do
## not need to use signal files from all samples. Instead, a subset of 300 to
## 500 samples from unrelated subjects is good enough to estimate PFB
## (population frequency of B allele) for the project. Please put the prepared
## "list_pfb.txt" in the directory "${WKDIR}/01_initial_call/run_PennCNV/data_aux".

#### (1.2) compile gcmodel file for GC content adjustment
## NOTE:
## The "gc5Base_hg19.txt.sorted" (take hg19 for example) is generated based on
## the UCSC Genome Browser annotation file
## (http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/gc5Base.txt.gz).
## Despite the file name, it actually contains GC content per 5120bp. If you
## need the GC annotation file for another genome assembly, please download it
## from the corresponding directory. After downloading "gc5Base.txt.gz" (put it
## in `${WKDIR}/01_initial_call/run_PennCNV/data_aux`) and unzipping the file,
## sort it such that chromosomes and positions are sorted.
sort -k 2,2 -k 3,3n \
  <"${WKDIR}/01_initial_call/run_PennCNV/data_aux/gc5Base.txt" \
  >"${WKDIR}/01_initial_call/run_PennCNV/data_aux/gc5Base_hg19.txt.sorted"

perl "${PENNCNV}/bin/cal_gc_snp.pl" \
  "${WKDIR}/01_initial_call/run_PennCNV/data_aux/gc5Base_hg19.txt.sorted" \
  "${WKDIR}/01_initial_call/run_PennCNV/data_aux/SNP.pfb" \
  -output "${WKDIR}/01_initial_call/run_PennCNV/data_aux/SNP.gcmodel"

#### (2) Run PennCNV for each sample in parallel (through job submitting system on cluster)
## NOTE: In "step.2.run.PennCNV.jobs.R", the scripts regarding job submission embraced by "##<<<... ##>>>..." need to be specified based on your system.
Rscript "${WKDIR}/01_initial_call/run_PennCNV/step.2.run.PennCNV.jobs.R" \
  --penncnv "${PENNCNV}" \
  --data "${WKDIR}/01_initial_call/run_PennCNV/data" \
  --wkdir "${WKDIR}/01_initial_call/run_PennCNV/results" \
  --pfb "${WKDIR}/01_initial_call/run_PennCNV/data_aux/SNP.pfb" \
  --gcmodel "${WKDIR}/01_initial_call/run_PennCNV/data_aux/SNP.gcmodel" \
  --hmm "${PENNCNV}/lib/hhall.hmm"

#### (3) Check job status and resubmit failed jobs
## NOTE: In "step.3.check.PennCNV.jobs.R", the scripts regarding job submission embraced by "##<<<... ##>>>..." need to be specified based on your system.
## Run this only after the jobs submitted in step (2) have had time to finish.
Rscript "${WKDIR}/01_initial_call/run_PennCNV/step.3.check.PennCNV.jobs.R" \
  --penncnv "${PENNCNV}" \
  --data "${WKDIR}/01_initial_call/run_PennCNV/data/" \
  --wkdir "${WKDIR}/01_initial_call/run_PennCNV/results" \
  --pfb "${WKDIR}/01_initial_call/run_PennCNV/data_aux/SNP.pfb" \
  --gcmodel "${WKDIR}/01_initial_call/run_PennCNV/data_aux/SNP.gcmodel" \
  --hmm "${PENNCNV}/lib/hhall.hmm"

#### (4) Combine PennCNV results (.rawcnv and .log files) from each sample
perl "${WKDIR}/01_initial_call/run_PennCNV/step.4.combine.PennCNV.res.pl" \
  --in_dir "${WKDIR}/01_initial_call/run_PennCNV/results/res" \
  --out_dir "${WKDIR}/01_initial_call/run_PennCNV/results"

#### (5) Merge closely adjacent CNVs and generate final results
Rscript "${WKDIR}/01_initial_call/run_PennCNV/step.5.clean.PennCNV.res.R" \
  --penncnv "${PENNCNV}" \
  --input "${WKDIR}/01_initial_call/run_PennCNV/results" \
  --pfb "${WKDIR}/01_initial_call/run_PennCNV/data_aux/SNP.pfb"


## run_QuantiSNP --------------------------------------------------------------
## Uses QUANTISNP, already set at the top of this template; the line below is
## a reminder only and stays commented out like those of the other callers.
#QUANTISNP=</path/to/QuantiSNP>

#### (1) Run QuantiSNP for each sample in parallel (through job submitting system)
Rscript "${WKDIR}/01_initial_call/run_QuantiSNP/step.1.prepare.QuantiSNP.R" \
  --quantisnp "${QUANTISNP}" \
  --data "${WKDIR}/01_initial_call/run_QuantiSNP/data" \
  --sample "${WKDIR}/data/Samples_Table.txt" \
  --result "${WKDIR}/01_initial_call/run_QuantiSNP/results/res"

#### (2) Check job status and resubmit unfinished jobs
Rscript "${WKDIR}/01_initial_call/run_QuantiSNP/step.2.check.QuantiSNP.R" \
  --quantisnp "${QUANTISNP}" \
  --data "${WKDIR}/01_initial_call/run_QuantiSNP/data" \
  --sample "${WKDIR}/data/Samples_Table.txt" \
  --result "${WKDIR}/01_initial_call/run_QuantiSNP/results/res"

#### (3) Combine QuantiSNP results from each sample, including the content in ".cnv" files
perl "${WKDIR}/01_initial_call/run_QuantiSNP/step.3.combine.QuantiSNP.pl" \
  --in_dir "${WKDIR}/01_initial_call/run_QuantiSNP/results/res" \
  --out_dir "${WKDIR}/01_initial_call/run_QuantiSNP/results"


## 2 Batch effect =============================================================

### PCA on raw LRR data -------------------------------------------------------
#### (1) Randomly select 100,000 SNPs based on information from "SNP_pos.txt".
Rscript "${WKDIR}/02_batch_effect/PCA_on_LRR/step.1.down.sampling.R" \
  "${WKDIR}/01_initial_call/finalreport_to_matrix_LRR_and_BAF/SNP_pos.txt" \
  "${WKDIR}/02_batch_effect/PCA_on_LRR"

#### (2) Extract LRR values at randomly selected SNPs across individuals from final report
perl "${WKDIR}/02_batch_effect/PCA_on_LRR/step.2.LRR.matrix.pl" \
  "${WKDIR}/02_batch_effect/PCA_on_LRR/snps.down.sample.txt" \
  "${WKDIR}/data/final_report.txt" \
  "${WKDIR}/02_batch_effect/PCA_on_LRR/LRR_matrix_for_PCA.txt"

#### (3) PCA on LRR matrix
Rscript "${WKDIR}/02_batch_effect/PCA_on_LRR/step.3.LRR.PCA.R" \
  "${WKDIR}/02_batch_effect/PCA_on_LRR/" \
  "${WKDIR}/02_batch_effect/PCA_on_LRR/LRR_matrix_for_PCA.txt"

### PCA on summary statistics -------------------------------------------------
#### (1) Generate iPattern, PennCNV and QuantiSNP sample-level summary statistics
## arguments: the result folders of the three callers, then the output folder
Rscript "${WKDIR}/02_batch_effect/PCA_on_summary_stats/step.1.prepare.stats.R" \
  "${WKDIR}/01_initial_call/run_iPattern/results" \
  "${WKDIR}/01_initial_call/run_PennCNV/results" \
  "${WKDIR}/01_initial_call/run_QuantiSNP/results" \
  "${WKDIR}/02_batch_effect/PCA_on_summary_stats"

#### (2) PCA on sample-level summary statistics
Rscript "${WKDIR}/02_batch_effect/PCA_on_summary_stats/step.2.stats.PCA.R" \
  "${WKDIR}/02_batch_effect/PCA_on_summary_stats"


## 3 Create CNVR ==============================================================

#### (1) Extract CNV information from individual calls made by iPattern, PennCNV and QuantiSNP
###<<<<<< specify <project_name> used in iPattern analysis >>>>>>###
## PROJECT_NAME selects the iPattern result file "${PROJECT_NAME}_all_calls.txt".
#PROJECT_NAME=<project_name>

Rscript "${WKDIR}/03_create_CNVR/step.1.CNV.data.R" \
  "${WKDIR}/03_create_CNVR" \
  "${WKDIR}/01_initial_call/run_iPattern/results/${PROJECT_NAME}_all_calls.txt" \
  "${WKDIR}/01_initial_call/run_PennCNV/results/CNV.PennCNV_new.txt" \
  "${WKDIR}/01_initial_call/run_QuantiSNP/results/quantisnp.cnv" \
  "${WKDIR}/data/Samples_Table.txt"

#### (2) Merge CNV calls from individual methods into CNVRs
Rscript "${WKDIR}/03_create_CNVR/step.2.create.CNVR.R" \
  --icnv "${WKDIR}/03_create_CNVR/cnv.ipattern.txt" \
  --pcnv "${WKDIR}/03_create_CNVR/cnv.penncnv.txt" \
  --qcnv "${WKDIR}/03_create_CNVR/cnv.quantisnp.txt" \
  --snp "${WKDIR}/01_initial_call/finalreport_to_matrix_LRR_and_BAF/SNP_pos.txt" \
  --centromere "${WKDIR}/data/centromere_hg19.txt"


## 4 CNV genotyping for each CNVR =============================================

#### link necessary input data in the directory: ${WKDIR}/04_CNV_genotype/data/
ln -s "${WKDIR}/01_initial_call/run_PennCNV/data_aux/SNP.pfb" \
  "${WKDIR}/04_CNV_genotype/data/SNP.pfb"
ln -s "${WKDIR}/03_create_CNVR/cnvr_clean.txt" \
  "${WKDIR}/04_CNV_genotype/data/cnvr_clean.txt"
ln -s "${WKDIR}/03_create_CNVR/cnv_clean.txt" \
  "${WKDIR}/04_CNV_genotype/data/cnv_clean.txt"
## the PennCNV sample QC table is linked under the name sample_QC.txt
ln -s "${WKDIR}/01_initial_call/run_PennCNV/results/CNV.PennCNV_qc_new.txt" \
  "${WKDIR}/04_CNV_genotype/data/sample_QC.txt"
## duplicate_pairs.txt is optional
## NOTE(review): ln -s creates the link even when the target file does not
## exist (a dangling link) -- confirm whether it should be skipped in that case.
ln -s "${WKDIR}/data/duplicate_pairs.txt" \
  "${WKDIR}/04_CNV_genotype/data/duplicate_pairs.txt"

#### (1) Split CNVRs into different batches
Rscript "${WKDIR}/04_CNV_genotype/step.1.split.cnvrs.into.batches.R" \
  -i "${WKDIR}/04_CNV_genotype/data/cnvr_clean.txt" \
  -o "${WKDIR}/04_CNV_genotype/data/cnvr_batch.txt" \
  -n 200

#### (2) Submit parallelized jobs for CNV genotyping, each corresponding to one batch
Rscript "${WKDIR}/04_CNV_genotype/step.2.submit.jobs.R" \
  --type 0 \
  --script "${WKDIR}/04_CNV_genotype" \
  --sourcefile "${WKDIR}/04_CNV_genotype/scripts" \
  --datapath "${WKDIR}/04_CNV_genotype/data" \
  --matrixpath "${WKDIR}/01_initial_call/finalreport_to_matrix_LRR_and_BAF/RDS" \
  --resultpath "${WKDIR}/04_CNV_genotype/results" \
  --joblog "${WKDIR}/04_CNV_genotype/results" \
  --duplicates \
  --plot

#### (3) Check submitted jobs and resubmit failed jobs
## Run this only after the jobs submitted in step (2) have had time to finish.
Rscript "${WKDIR}/04_CNV_genotype/step.3.check.and.resubmit.jobs.R" \
  --flag 1 \
  --script "${WKDIR}/04_CNV_genotype" \
  --sourcefile "${WKDIR}/04_CNV_genotype/scripts" \
  --datapath "${WKDIR}/04_CNV_genotype/data" \
  --matrixpath "${WKDIR}/01_initial_call/finalreport_to_matrix_LRR_and_BAF/RDS" \
  --resultpath "${WKDIR}/04_CNV_genotype/results" \
  --joblog "${WKDIR}/04_CNV_genotype/results" \
  --duplicates \
  --plot

#### (4) Combine results from parallelized jobs
Rscript "${WKDIR}/04_CNV_genotype/step.4.prediction.results.R" \
  --datapath "${WKDIR}/04_CNV_genotype/data" \
  --resultpath "${WKDIR}/04_CNV_genotype/results"


## 5 Boundary refinement ======================================================

#### link necessary input data in the directory: ${WKDIR}/05_boundary_refinement/data
## Every link must be created inside .../05_boundary_refinement/data, the
## folder passed as --datapath to the refinement steps.
ln -s "${WKDIR}/01_initial_call/finalreport_to_matrix_LRR_and_BAF/SNP_pos.txt" \
  "${WKDIR}/05_boundary_refinement/data/SNP_pos.txt"
ln -s "${WKDIR}/04_CNV_genotype/results/cnvr_genotype.txt" \
  "${WKDIR}/05_boundary_refinement/data/cnvr_genotype.txt"
ln -s "${WKDIR}/04_CNV_genotype/results/matrix_CN.rds" \
  "${WKDIR}/05_boundary_refinement/data/matrix_CN.rds"
ln -s "${WKDIR}/04_CNV_genotype/results/matrix_GQ.rds" \
  "${WKDIR}/05_boundary_refinement/data/matrix_GQ.rds"

#### (1) Select CNVRs with common CNV genotype to be refined.
Rscript "${WKDIR}/05_boundary_refinement/step.1.common.CNVR.to.refine.R" \
  --datapath "${WKDIR}/05_boundary_refinement/data" \
  --resultpath "${WKDIR}/05_boundary_refinement/results" \
  --freq 0.05

#### (2) Submit parallelized jobs for boundary refinement, each corresponding to CNVRs in one chromosome
Rscript "${WKDIR}/05_boundary_refinement/step.2.submit.jobs.R" \
  --refinescript "${WKDIR}/05_boundary_refinement/CNVR.boundary.refinement.R" \
  --rcppfile "${WKDIR}/05_boundary_refinement/refine.cpp" \
  --datapath "${WKDIR}/05_boundary_refinement/data" \
  --matrixpath "${WKDIR}/01_initial_call/finalreport_to_matrix_LRR_and_BAF/RDS" \
  --resultpath "${WKDIR}/05_boundary_refinement/results" \
  --centromere "${WKDIR}/data/centromere_hg19.txt" \
  --plot

#### (3) Combine results from parallelized jobs
Rscript "${WKDIR}/05_boundary_refinement/step.3.clean.results.R" \
  --resultpath "${WKDIR}/05_boundary_refinement/results"


##<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
## 05a_regenotype_after_refinement --------------------------------------------
## The CNVRs listed in "cnvr_regenotype_after_refine.txt" will go through CNV genotyping
## as done in 04_CNV_genotype

#### link necessary input data in the directory: ${WKDIR}/05a_regenotype_after_refinement/data/
ln -s "${WKDIR}/01_initial_call/run_PennCNV/data_aux/SNP.pfb" \
  "${WKDIR}/05a_regenotype_after_refinement/data/SNP.pfb"
## the list of CNVRs to regenotype is linked under the name cnvr_clean.txt
ln -s "${WKDIR}/05_boundary_refinement/results/cnvr_regenotype_after_refine.txt" \
  "${WKDIR}/05a_regenotype_after_refinement/data/cnvr_clean.txt"
ln -s "${WKDIR}/03_create_CNVR/cnv_clean.txt" \
  "${WKDIR}/05a_regenotype_after_refinement/data/cnv_clean.txt"
ln -s "${WKDIR}/01_initial_call/run_PennCNV/results/CNV.PennCNV_qc_new.txt" \
  "${WKDIR}/05a_regenotype_after_refinement/data/sample_QC.txt"
## duplicate_pairs.txt is optional
## NOTE(review): ln -s creates the link even when the target file does not
## exist (a dangling link) -- confirm whether it should be skipped in that case.
ln -s "${WKDIR}/data/duplicate_pairs.txt" \
  "${WKDIR}/05a_regenotype_after_refinement/data/duplicate_pairs.txt"

## the scripts for regenotyping are the same as those used in ${WKDIR}/04_CNV_genotype;
## only the data, result and job-log folders point to 05a_regenotype_after_refinement
#### (1) Split CNVRs into different batches
Rscript "${WKDIR}/04_CNV_genotype/step.1.split.cnvrs.into.batches.R" \
  -i "${WKDIR}/05a_regenotype_after_refinement/data/cnvr_clean.txt" \
  -o "${WKDIR}/05a_regenotype_after_refinement/data/cnvr_batch.txt" \
  -n 200

#### (2) Submit parallelized jobs for CNV genotyping, each corresponding to one batch
Rscript "${WKDIR}/04_CNV_genotype/step.2.submit.jobs.R" \
  --type 0 \
  --script "${WKDIR}/04_CNV_genotype" \
  --sourcefile "${WKDIR}/04_CNV_genotype/scripts" \
  --datapath "${WKDIR}/05a_regenotype_after_refinement/data" \
  --matrixpath "${WKDIR}/01_initial_call/finalreport_to_matrix_LRR_and_BAF/RDS" \
  --resultpath "${WKDIR}/05a_regenotype_after_refinement/results" \
  --joblog "${WKDIR}/05a_regenotype_after_refinement/results" \
  --duplicates \
  --plot

#### (3) Check submitted jobs and resubmit failed jobs
## Run this only after the jobs submitted in step (2) have had time to finish.
Rscript "${WKDIR}/04_CNV_genotype/step.3.check.and.resubmit.jobs.R" \
  --flag 1 \
  --script "${WKDIR}/04_CNV_genotype" \
  --sourcefile "${WKDIR}/04_CNV_genotype/scripts" \
  --datapath "${WKDIR}/05a_regenotype_after_refinement/data" \
  --matrixpath "${WKDIR}/01_initial_call/finalreport_to_matrix_LRR_and_BAF/RDS" \
  --resultpath "${WKDIR}/05a_regenotype_after_refinement/results" \
  --joblog "${WKDIR}/05a_regenotype_after_refinement/results" \
  --duplicates \
  --plot

#### (4) Combine results from parallelized jobs
Rscript "${WKDIR}/04_CNV_genotype/step.4.prediction.results.R" \
  --datapath "${WKDIR}/05a_regenotype_after_refinement/data" \
  --resultpath "${WKDIR}/05a_regenotype_after_refinement/results"
##>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>


#### (4) Update CN and GQ matrices as well as CNVR information
## Takes the matrices from before refinement (05_boundary_refinement/data) and
## the regenotyping results (05a_regenotype_after_refinement/results) as input.
Rscript "${WKDIR}/05_boundary_refinement/step.4.update.genotype.matrix.R" \
  --matrixbeforerefine "${WKDIR}/05_boundary_refinement/data" \
  --matrixrefine "${WKDIR}/05a_regenotype_after_refinement/results" \
  --refinepath "${WKDIR}/05_boundary_refinement/results" \
  --output "${WKDIR}/05_boundary_refinement/results"


## 6 Performance assessment ===================================================

#### (1) Evaluate concordance rate of CNV calls between technical duplicates
#### as well as sample-wise and CNVR-wise call rates
#### This step is optional
Rscript "${WKDIR}/06_performance_assessment/step.1.performance.assessment.R" \
  --duplicates "${WKDIR}/data/duplicate_pairs.txt" \
  --matrixCN "${WKDIR}/05_boundary_refinement/results/matrix_CN_final.rds" \
  --matrixGQ "${WKDIR}/05_boundary_refinement/results/matrix_GQ_final.rds" \
  --resultpath "${WKDIR}/06_performance_assessment"

#### (2) Set GQ score threshold to generate final results.
## choose <GQ_score_threshold> based on evaluation in step (1) or empirically based on previous studies
GQCUTOFF=<GQ_score_threshold>

Rscript "${WKDIR}/06_performance_assessment/step.2.set.GQ.generate.results.R" \
  --matrixCN "${WKDIR}/05_boundary_refinement/results/matrix_CN_final.rds" \
  --matrixGQ "${WKDIR}/05_boundary_refinement/results/matrix_GQ_final.rds" \
  --cnvrfile "${WKDIR}/05_boundary_refinement/results/cnvr_final.txt" \
  --resultpath "${WKDIR}/06_performance_assessment" \
  --gqscore "${GQCUTOFF}"