I filtered out genes that were expressed <25 FPKM in all samples. This is to minimize statistical artifacts that can occur with borderline detected genes. First, however, parsing the average FPKM table:
#loading fpkm table
setwd("~/count.data")
data <- read.table("genes.fpkm_table-HDX+HWG.txt", header=TRUE, sep="\t") #read the FPKM table from cuffnorm
#creating average FPKM expression values (to be used elsewhere, e.g. heatmaps)
data.avg <- data.frame(data[1],
pdb.avg=rowMeans(data[2:4], na.rm = FALSE, dims = 1),
mm.avg=rowMeans(data[5:7], na.rm = FALSE, dims = 1),
mmq.avg=rowMeans(data[8:10], na.rm = FALSE, dims = 1),
ic24.avg=rowMeans(data[11:13], na.rm = FALSE, dims = 1),
scl.avg=rowMeans(data[14:16], na.rm = FALSE, dims = 1),
pda.frt.avg=rowMeans(data[17:19], na.rm = FALSE, dims = 1),
pda.nec.avg=rowMeans(data[20:22], na.rm = FALSE, dims = 1),
ath.frt.avg=rowMeans(data[23:25], na.rm = FALSE, dims = 1),
ath.nec.avg=rowMeans(data[26:28], na.rm = FALSE, dims = 1),
sly.frt.avg=rowMeans(data[29:31], na.rm = FALSE, dims = 1),
sly.nec.avg=rowMeans(data[32:34], na.rm = FALSE, dims = 1),
han.frt.avg=rowMeans(data[35:37], na.rm = FALSE, dims = 1),
han.nec.avg=rowMeans(data[38:40], na.rm = FALSE, dims = 1),
bvu.frt.avg=rowMeans(data[41:43], na.rm = FALSE, dims = 1),
bvu.nec.avg=rowMeans(data[44:46], na.rm = FALSE, dims = 1),
rco.frt.avg=rowMeans(data[47:49], na.rm = FALSE, dims = 1),
rco.nec.avg=rowMeans(data[50:52], na.rm = FALSE, dims = 1),
pvu.frt.avg=rowMeans(data[53:55], na.rm = FALSE, dims = 1),
pvu.nec.avg=rowMeans(data[56:58], na.rm = FALSE, dims = 1)
)
fpkm.avg <- data.avg
write.table(fpkm.avg, "genes-AVG.fpkm_table-HDX+HWG.txt", sep="\t", row.names = F, quote=FALSE) #write the table into a file
Now, the average FPKM values per sample can be used to filter out non-expressed genes.
maxv <- apply(fpkm.avg[,2:20],1,max)
minv <- apply(fpkm.avg[,2:20],1,min)
avg.maxv <- data.frame(fpkm.avg[1:20], minv, maxv) #this makes a table with minimum and maximum expression across samples
cond2 <- avg.maxv$maxv < 25
fpkm.avg.expr25 <- avg.maxv[!cond2,] #this removes rows with maximum expression value below 25
write.table(fpkm.avg.expr25, "genes-AVG.fpkm_table-HDX+HWG.expr25.txt", sep="\t", row.names = F, quote=FALSE) #write the table into a file
Next, to filter the read count table.
data.rawread <- read.table("counts.raw.HDX+HWG.tab", header=TRUE, sep="\t") #read the count table
counts.expr25 <- merge(fpkm.avg.expr25[1], data.rawread, by = "identifier",
all.x = TRUE, all.y = FALSE, sort = TRUE)
write.table(counts.expr25, "counts.raw.HDX+HWG_expr25.txt", sep="\t", row.names = F, quote=FALSE) #write the table into a file