Skip to content

Latest commit

 

History

History
47 lines (47 loc) · 2.7 KB

03.Filtering.md

File metadata and controls

47 lines (47 loc) · 2.7 KB

Filtering non-expressed genes

I filtered out genes that were expressed <25 FPKM in all samples. This is to minimize statistical artifacts that can occur with borderline detected genes. First, however, parsing the average FPKM table:

#loading fpkm table
setwd("~/count.data")
data <- read.table("genes.fpkm_table-HDX+HWG.txt", header=TRUE, sep="\t") #read the FPKM table from cuffnorm
#creating average FPKM expression values (to be used elsewhere, e.g. heatmaps)
data.avg <- data.frame(data[1],
  pdb.avg=rowMeans(data[2:4], na.rm = FALSE, dims = 1),
  mm.avg=rowMeans(data[5:7], na.rm = FALSE, dims = 1),
  mmq.avg=rowMeans(data[8:10], na.rm = FALSE, dims = 1),
  ic24.avg=rowMeans(data[11:13], na.rm = FALSE, dims = 1),
  scl.avg=rowMeans(data[14:16], na.rm = FALSE, dims = 1),
  pda.frt.avg=rowMeans(data[17:19], na.rm = FALSE, dims = 1),
  pda.nec.avg=rowMeans(data[20:22], na.rm = FALSE, dims = 1),
  ath.frt.avg=rowMeans(data[23:25], na.rm = FALSE, dims = 1),
  ath.nec.avg=rowMeans(data[26:28], na.rm = FALSE, dims = 1),
  sly.frt.avg=rowMeans(data[29:31], na.rm = FALSE, dims = 1),
  sly.nec.avg=rowMeans(data[32:34], na.rm = FALSE, dims = 1),
  han.frt.avg=rowMeans(data[35:37], na.rm = FALSE, dims = 1),
  han.nec.avg=rowMeans(data[38:40], na.rm = FALSE, dims = 1),
  bvu.frt.avg=rowMeans(data[41:43], na.rm = FALSE, dims = 1),
  bvu.nec.avg=rowMeans(data[44:46], na.rm = FALSE, dims = 1),
  rco.frt.avg=rowMeans(data[47:49], na.rm = FALSE, dims = 1),
  rco.nec.avg=rowMeans(data[50:52], na.rm = FALSE, dims = 1),
  pvu.frt.avg=rowMeans(data[53:55], na.rm = FALSE, dims = 1),
  pvu.nec.avg=rowMeans(data[56:58], na.rm = FALSE, dims = 1)
  )
fpkm.avg <- data.avg
write.table(fpkm.avg, "genes-AVG.fpkm_table-HDX+HWG.txt", sep="\t", row.names = F, quote=FALSE) #write the table into a file

Now, the average FPKM values per sample can be used to filter out non-expressed genes.

maxv <- apply(fpkm.avg[,2:20],1,max)
minv <- apply(fpkm.avg[,2:20],1,min)
avg.maxv <- data.frame(fpkm.avg[1:20], minv, maxv) #this makes a table with minimum and maximum expression across samples
cond2 <- avg.maxv$maxv < 25
fpkm.avg.expr25 <- avg.maxv[!cond2,] #this removes rows with maximum expression value below 25
write.table(fpkm.avg.expr25, "genes-AVG.fpkm_table-HDX+HWG.expr25.txt", sep="\t", row.names = F, quote=FALSE) #write the table into a file

Next, to filter the read count table.

data.rawread <- read.table("counts.raw.HDX+HWG.tab", header=TRUE, sep="\t") #read the count table
counts.expr25 <- merge(fpkm.avg.expr25[1], data.rawread, by = "identifier", 
	all.x = TRUE, all.y = FALSE, sort = TRUE)
write.table(counts.expr25, "counts.raw.HDX+HWG_expr25.txt", sep="\t", row.names = F, quote=FALSE) #write the table into a file