Skip to content

Commit

Permalink
Add capability to call SNPs on scaffolds by modifying Genotype_GVCFs
Browse files Browse the repository at this point in the history
  • Loading branch information
Aerin13 committed Apr 27, 2018
1 parent 914b037 commit e82460c
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 6 deletions.
9 changes: 7 additions & 2 deletions Config
Original file line number Diff line number Diff line change
Expand Up @@ -396,11 +396,16 @@ GVCF_LIST=
# Include the full file path.
REF_DICT=

# How many chromosomes or chromosome parts does the reference have? (integer value)
# How many chromosomes or chromosome parts does the reference have, excluding scaffolds? (integer value)
# For barley: 15 (7*2 chromosome parts + chrUn)
# For soybean: 20 (this excludes scaffolds)
# For soybean: 20
NUM_CHR=

# Optional: Include the full file path to a list of the names of any and all scaffolds or parts of the reference not covered by the chromosomes above
# One scaffold name per line, must end in .intervals, leave blank if you wish to not call SNPs on non-chromosomal sequence
# Samtools style intervals are also acceptable, one per line (ex: chr1:100-200)
CUSTOM_INTERVALS=

# What is the sample ploidy? (integer value)
# For highly inbred samples (most barleys): 1
PLOIDY=
Expand Down
16 changes: 14 additions & 2 deletions Handlers/Genotype_GVCFs.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,20 @@ function Genotype_GVCFs() {
local ploidy="$6" # What is the sample ploidy?
local memory="$7" # How much memory can java use?
local dict="$8" # Where is the reference dictionary?
local maxarray="$9" # What is the maximum array index?
local scaffolds="${10}" # Where is the scaffolds intervals file?
local seqs_list=($(cut -f 2 ${dict} | grep -E '^SN' | cut -f 2 -d ':')) # Make an array of chromosome part names
local current="${seqs_list[${PBS_ARRAYID}]}" # What is the current chromosome part we're working on?
# What region of the genome are we working on currently?
if [[ "${PBS_ARRAYID}" = "${maxarray}" && ! -z "${scaffolds}" ]]
then
# If this is the last array index AND we have a scaffolds file, use the scaffolds
local current="${scaffolds}"
local out_name="custom_intervals"
else
# If this isn't the last array index OR we don't have a scaffolds file, use a chromosome part name
local current="${seqs_list[${PBS_ARRAYID}]}"
local out_name="${current}"
fi
declare -a sample_array=($(grep -E ".g.vcf" "${sample_list}")) # Put the sample list into array format
# Put the samples into a format that GATK can read
GATK_IN=()
Expand All @@ -40,7 +52,7 @@ function Genotype_GVCFs() {
"${GATK_IN[@]}" \
--heterozygosity "${heterozygosity}" \
--sample_ploidy "${ploidy}" \
-o "${out}/${current}.vcf")
-o "${out}/${out_name}.vcf")
}

# Export the function
Expand Down
5 changes: 3 additions & 2 deletions sequence_handling
Original file line number Diff line number Diff line change
Expand Up @@ -260,12 +260,13 @@ case "${ROUTINE}" in
checkDict "${REF_GEN}" # Check to make sure our reference genome has a dict file
if [[ "$?" -ne 0 ]]; then echo "Generating dictionary file for ${REF_GEN}..." >&2; createDict "${REF_GEN}" "${GG_MEM}" "${PICARD_JAR}"; fi # If not, generate one
SINGLE_ARRAY_LIMIT=$[${NUM_CHR} - 1] # Get the maximum number of Torque tasks (# in array - 1)
if [[ ! -z "${CUSTOM_INTERVALS}" ]]; then SINGLE_ARRAY_LIMIT="${NUM_CHR}"; fi # If we have scaffolds, add one to the maximum array index
echo "Max array index is ${SINGLE_ARRAY_LIMIT}...">&2
if [[ "${SINGLE_ARRAY_LIMIT}" -ne 0 ]]
then # If we have enough samples for a task array
echo "source ${CONFIG} && source ${SEQUENCE_HANDLING}/Handlers/Genotype_GVCFs.sh && Genotype_GVCFs ${GVCF_LIST} ${OUT_DIR} ${GATK_JAR} ${REF_GEN} ${THETA} ${PLOIDY} ${GG_MEM} ${REF_DICT}" | qsub -t 0-"${SINGLE_ARRAY_LIMIT}" -l "${GG_QSUB}" -e "${ERROR}" -o "${ERROR}" -m abe -M "${EMAIL}" -N "${PROJECT}"_Genotype_GVCFs
echo "source ${CONFIG} && source ${SEQUENCE_HANDLING}/Handlers/Genotype_GVCFs.sh && Genotype_GVCFs ${GVCF_LIST} ${OUT_DIR} ${GATK_JAR} ${REF_GEN} ${THETA} ${PLOIDY} ${GG_MEM} ${REF_DICT} ${SINGLE_ARRAY_LIMIT} ${CUSTOM_INTERVALS}" | qsub -t 0-"${SINGLE_ARRAY_LIMIT}" -l "${GG_QSUB}" -e "${ERROR}" -o "${ERROR}" -m abe -M "${EMAIL}" -N "${PROJECT}"_Genotype_GVCFs
else # If we have only one sample
echo "source ${CONFIG} && source ${SEQUENCE_HANDLING}/Handlers/Genotype_GVCFs.sh && Genotype_GVCFs ${GVCF_LIST} ${OUT_DIR} ${GATK_JAR} ${REF_GEN} ${THETA} ${PLOIDY} ${GG_MEM} ${REF_DICT}" | qsub -t 0 -l "${GG_QSUB}" -e "${ERROR}" -o "${ERROR}" -m abe -M "${EMAIL}" -N "${PROJECT}"_Genotype_GVCFs
echo "source ${CONFIG} && source ${SEQUENCE_HANDLING}/Handlers/Genotype_GVCFs.sh && Genotype_GVCFs ${GVCF_LIST} ${OUT_DIR} ${GATK_JAR} ${REF_GEN} ${THETA} ${PLOIDY} ${GG_MEM} ${REF_DICT} ${SINGLE_ARRAY_LIMIT} ${CUSTOM_INTERVALS}" | qsub -t 0 -l "${GG_QSUB}" -e "${ERROR}" -o "${ERROR}" -m abe -M "${EMAIL}" -N "${PROJECT}"_Genotype_GVCFs
fi
;;
8|Create_HC_Subset )
Expand Down

0 comments on commit e82460c

Please sign in to comment.