These are the preprocessing steps we applied to our data for the Swan manuscript before using Swan. Obtaining a transcriptome via TALON is not required; you can use any other tool that yields a transcriptome as input to Swan.
If you don't want to process the raw data from square one and just want to get started using Swan, please see the Getting started section.
You can download TALON and view its documentation at https://github.com/mortazavilab/TALON.
Each of the steps below should be run in your Bash terminal:
# Create the working directories. -p makes this step safe to re-run when the
# directories already exist (plain mkdir would report an error).
mkdir -p data figures
cd data/
# download reference files
# Genome: GRCh38 "no alt" analysis set from ENCODE.
wget https://www.encodeproject.org/files/GRCh38_no_alt_analysis_set_GCA_000001405.15/@@download/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta.gz
gunzip GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta.gz
# Keep only the first whitespace-delimited field of every line, which trims
# each FASTA header down to its first token; the result is saved as hg38.fa.
awk '{print $1}' GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta > hg38.fa
# Annotation: GENCODE release 29 GTF.
wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_29/gencode.v29.annotation.gtf.gz
gunzip gencode.v29.annotation.gtf.gz
# Download the ENCODE BAM files and rename each one from its ENCODE accession
# to <cell line>_<replicate>.bam. These names matter: the dataset names used
# by the later filtering step (hepg2_1, hepg2_2, hffc6_1, ...) are derived
# from them.
# hepg2 rep 1
wget https://www.encodeproject.org/files/ENCFF575ABA/@@download/ENCFF575ABA.bam
mv ENCFF575ABA.bam hepg2_1.bam
# hepg2 rep 2
# (fixed: the command previously read "wget wget <url>", which made wget try
# to fetch a URL called "wget" and exit with an error status)
wget https://www.encodeproject.org/files/ENCFF463JKW/@@download/ENCFF463JKW.bam
mv ENCFF463JKW.bam hepg2_2.bam
# hffc6 rep 1
wget https://www.encodeproject.org/files/ENCFF677TYB/@@download/ENCFF677TYB.bam
mv ENCFF677TYB.bam hffc6_1.bam
# hffc6 rep 2
wget https://www.encodeproject.org/files/ENCFF584UQW/@@download/ENCFF584UQW.bam
mv ENCFF584UQW.bam hffc6_2.bam
# hffc6 rep 3
wget https://www.encodeproject.org/files/ENCFF964PGS/@@download/ENCFF964PGS.bam
mv ENCFF964PGS.bam hffc6_3.bam
# Convert every BAM file in the current directory into a SAM file with the
# same stem (<name>.bam -> <name>.sam). samtools is made available through
# the environment-modules system.
module load samtools
for bam in *.bam; do
  sam="$(basename "$bam" .bam).sam"
  # -h is passed so the output carries the header as well (see samtools view docs).
  samtools view -h "$bam" > "$sam"
done
# Run talon_label_reads on every SAM file in the current directory, using
# hg38.fa as the reference. Results go to labelled/<stem>...; the later
# config step expects them to appear as labelled/<stem>_labeled.sam.
# NOTE(review): per the TALON docs this step records the fraction of As
# downstream of each read for internal-priming filtering; confirm.
FASTA=hg38.fa
mkdir labelled
for sam in *.sam; do
  prefix=$(basename "$sam" .sam)
  # Each input gets its own temp directory (<stem>_temp), removed afterwards
  # via --deleteTmp.
  talon_label_reads \
    --f "$sam" \
    --g "$FASTA" \
    --t 8 \
    --tmpDir "${prefix}_temp" \
    --deleteTmp \
    --o "labelled/${prefix}"
done
# Go back up to the project root and initialise the TALON database from the
# GENCODE v29 GTF. The build name (hg38) and annotation name (gencode_v29)
# given here are the same ones the later talon / talon_filter_transcripts /
# talon_create_GTF / talon_abundance calls refer to, and the "talon" output
# prefix is what yields the talon.db used by those calls.
# NOTE(review): --l / --5p / --3p are presumably the minimum transcript length
# and the 5'/3' end-matching distances; confirm against the TALON docs.
cd ..
GTF=data/gencode.v29.annotation.gtf
talon_initialize_database \
  --f "$GTF" \
  --g hg38 --a gencode_v29 \
  --l 0 \
  --idprefix TALON \
  --5p 500 --3p 300 \
  --o talon
#######################################
# Print one TALON config row per labelled SAM file:
#   <dataset>,<sample>,<platform>,<path to SAM>
# The dataset name is the file name without ".sam" and without the "_labeled"
# suffix (e.g. hepg2_1_labeled.sam -> hepg2_1); the sample name is the dataset
# name without its trailing "_<replicate>" (e.g. hepg2_1 -> hepg2).
# Arguments: $1     - sequencing platform name
#            $2...  - paths to labelled SAM files
# Outputs:   CSV rows on stdout
#######################################
talon_config_rows() {
  local platform=$1
  shift
  local file dataset sample
  for file in "$@"; do
    # Skip arguments that are not existing files, e.g. the literal pattern
    # left behind when the *.sam glob matches nothing.
    [[ -e "$file" ]] || continue
    dataset=${file##*/}
    dataset=${dataset%.sam}
    dataset=${dataset%_labeled}
    # Strip "_<replicate>" whatever its width, so two-digit replicates work too.
    sample=${dataset%_*}
    # Values are passed as printf arguments, never as the format string.
    printf '%s,%s,%s,%s\n' "$dataset" "$sample" "$platform" "$file"
  done
}

plat="Sequel"
# Write (rather than append to) config.csv so that re-running this step does
# not duplicate rows from an earlier run.
talon_config_rows "$plat" data/labelled/*.sam > config.csv
# Run TALON on the datasets listed in config.csv, annotating them against the
# talon.db database for the hg38 build with 16 threads. Output files use the
# "talon_swan" prefix.
# NOTE(review): --cov / --identity are presumably the minimum alignment
# coverage and identity a read needs to be considered; confirm in TALON docs.
talon \
  --f config.csv --db talon.db \
  --t 16 \
  --build hg38 \
  --cov 0.9 --identity 0.8 \
  --o talon_swan
# Build one transcript pass list per cell line with talon_filter_transcripts.
# Each entry below is "<cell line>:<comma-separated datasets>"; the settings
# are identical for both cell lines, and the result is written to
# <cell line>_pass_list.csv.
for spec in \
  "hepg2:hepg2_1,hepg2_2" \
  "hffc6:hffc6_1,hffc6_2,hffc6_3"; do
  cell_line=${spec%%:*}
  datasets=${spec#*:}
  talon_filter_transcripts \
    --db talon.db \
    -a gencode_v29 \
    --datasets "$datasets" \
    --maxFracA 0.5 \
    --minCount 5 \
    --minDatasets 2 \
    --o "${cell_line}_pass_list.csv"
done
#######################################
# Print the union of the given pass-list files with duplicate lines removed.
# The input is sorted first because `uniq` on its own only collapses
# *adjacent* duplicates, so a transcript listed in both (unsorted) files
# would otherwise appear twice. LC_ALL=C gives a byte-wise, locale-independent
# comparison.
# NOTE(review): assumes the pass lists have no header row and that row order
# does not matter to downstream TALON steps; confirm.
# Arguments: $@ - pass-list CSV files
# Outputs:   merged, de-duplicated lines on stdout
#######################################
merge_pass_lists() {
  LC_ALL=C sort -u -- "$@"
}

# Combine the HepG2 and HFFc6 pass lists into a single list for the
# GTF/abundance steps.
merge_pass_lists hepg2_pass_list.csv hffc6_pass_list.csv > all_pass_list.csv
# Export the final transcriptome (GTF) and the abundance table from talon.db,
# both restricted to the transcripts in all_pass_list.csv and both written
# with the output prefix "all".
# NOTE(review): TALON is expected to name the results all_talon.gtf and
# all_talon_abundance.tsv, and --observed presumably limits the GTF to
# transcripts observed in the datasets; confirm against your TALON version.
talon_create_GTF \
  --db talon.db \
  -b hg38 -a gencode_v29 \
  --whitelist all_pass_list.csv \
  --observed \
  --o all

talon_abundance \
  --db talon.db \
  -a gencode_v29 -b hg38 \
  --whitelist all_pass_list.csv \
  --o all