Skip to content

Commit

Permalink
Added the HomerTools script, made updates for printing out like I nee…
Browse files Browse the repository at this point in the history
…d it to
  • Loading branch information
aomdahl committed Mar 6, 2018
1 parent 547ae29 commit dec6562
Showing 1 changed file with 317 additions and 0 deletions.
317 changes: 317 additions & 0 deletions Toolkit_scripts/HomerTools.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,317 @@
#!/usr/bin/python
###########################################
#This is a simple tool to perform useful functions having to do with the HOMER motif recognition tool.
##########################################
import re
import os
import argparse

START = ">"

#An object for reading in and manipulating homer data, but most especially for creating
#custom homer files
#Each HomerData object represents a single motif.
class HomerData:
def __init__(self, seq):
self.seqName = seq
if seq[0] != START:
self.seqName = START + seq
self.name = "ID"
self.thresh = 8
self.matrix = list()

def write(self, openFile):
openFile.write(self.seqName + '\t' + self.name + '\t' + str(self.thresh) + '\n')
self.writeMatrix(openFile)

def writeMatrix(self, openFile):
for row in self.matrix:
for i in range(0, len(row)):
if i == (len(row)-1):
openFile.write(str(row[i]) + '\n')
else:
openFile.write(str(row[i]) + '\t')



###################Parse HTML from HOMER output################

#A super function that can perform the extraction on a series of directories and return printed output
def assembleMotifMatchResults(dir_path):
first_run = True
#iterate through each directory
for i in os.listdir(dir_path):
if os.path.isdir(dir_path + "/" + i): #If folder is a directory
#look for the homerResults directory
if os.path.isdir(dir_path + "/" + i + "/homerResults"):
#iterate through each relevant html file inside.
writeMotifMatchHTML(dir_path + "/" + i + "/homerResults", first_run, dirs_name =i)
first_run = False
return


#Helper function to write it out to a page.
def writeMotifMatchHTML(path, header = False, dirs_name = ""):
if header:
print("Dir_name\tComparison_seq\tReference_seq\tRank\tScore")
refMotifMap = dict()
#writeOut= open(outputFile, 'w')
#print(os.listdir(path))
for i in os.listdir(path):
if i.endswith(".info.html"):
motifMatchHTMLExtraction(path + "/" + i, printOut = True, dir_name = dirs_name)
#print "Output written to", outputFile
#writeOut.close()


#Main function: Extracts the list of matching motifs from the Homer output
#htmlFile- path to the file in
#@param outFile- the path to an output writing file if you want to do that.
#@return a list [seqeunce, list of each TFBS that matches that sequence]
def motifMatchHTMLExtraction(htmlFile, printOut = False, dir_name = ""):
#The keys that we want
refMotifDataList = list()
htmlFileStream = open(htmlFile, 'r')
currentLine = htmlFileStream.readline().strip()
#get the second line
currentLine = htmlFileStream.readline().strip()
#The Motif sequence is on the second line
seq = extractMotifSeqHTML(currentLine)

while currentLine != "":
refMotifDataList.append(findMotifDataCoreHTML(htmlFileStream))
currentLine = htmlFileStream.readline().strip()
htmlFileStream.close()

#Option to write these out.
if printOut:
print_str = ""
#print refMotifDataList
for subList in refMotifDataList:
print_str = dir_name + '\t'
if subList is not None:
print_str = print_str+ seq +'\t'
for element in subList:
if type(element) is str:
print_str = print_str + "\t"+ element
print(print_str)
return

return [seq, filter(None, refMotifDataList)] #filter out the leftover nonetype matches.


#Extracts the motif's sequence from the HTML file
#Useful as a key
def extractMotifSeqHTML(currLine):
if "<H2>" in currLine:
return re.findall("([A-Z\(\)]+)\s", currLine)[0]

#Aids motifMatchHTMLExtraction: actually parses for the information from the page that we care about
#@param fileStream:
#@param the motif sequence
#@return a list containing the [name, rank, match_score]
def findMotifDataCoreHTML(fileStream):
regexOptions= ['<H4>\d+(M[\w\d\._-]+)<\/H4>', '<H4>\d+([\w\d\._-]+)<\/H4>','<H4>\d*([\w\d\._-]+)<\/H4>']
ID_HEADER = "<H4>"
ID = "MotifID"
MATCH_RANK = "Match Rank:"
SCORE = "Score:"
OFFSET = "Offset:"
motifData = list()
currentLine = fileStream.readline().strip()
while currentLine != "":
if ID_HEADER in currentLine:
header = re.search("<H4>\d+-([\w\d\._-]+)<\/H4>", currentLine)
regexOption = 0
#Make sure we get a valid response
while header is None:
header = re.search(regexOptions[regexOption], currentLine)
regexOption += 1
motifData.append(header.group(1))
elif MATCH_RANK in currentLine:
motifData.append(extractHTMLValue(currentLine, MATCH_RANK))
elif SCORE in currentLine:
motifData.append(extractHTMLValue(currentLine, SCORE))
elif OFFSET in currentLine: #After we get all the data we want
if len(motifData) != 3:
print ("ERROR- didn't get everything")
return motifData
else:
pass
currentLine = fileStream.readline().strip()


#A specific tool to extract HTML values from the homer output html
#Specific to this
#@param currentLine of the HTML file
#@param key to look for and excise
#@return the value with that assignment
def extractHTMLValue(currentLine, key, group1 = False):
regex = "<TD>([\w\s\.:]+)<\/TD"
regexResult = regexExtraction(currentLine,regex, groupNum = 2)
if regexResult is not None:
return regexResult
else:
return ""



#Simple function to get the name of a motif using regular expressions
#String to search,
#@param regular expression to search with
def regexExtraction(strIn, regex, groupNum = 1):
regex = re.findall(regex, strIn)
if regex is not None:
return regex[groupNum-1]
else:
return None


#####################################Creating custom HOMER files############################################

#The core function: itertes through a list of HOMER objects and writes them out in the appropriate format
#@param filePath: the file path you want to write to
#@param the list of homer objects
def writeHomerDB(filePath, homerList):
HOMERDB = open(filePath, 'w')
for item in homerList:
item.write(HOMERDB)
HOMERDB.close()

#A custom tool made for converting the downloadable library from PlantPan into a .motifs HOMER compatible database
#@param inFilePath- the PlantPan databse file
#@param outFilePath- where you wish to write the new HOMER compatible library to
#@customThresh- the logOddsDetection threshold required for HOMER files. For a Db, this value can be anywhere between
##5 and 10 with little effect on outcome
def convertPPToHomerLibrary(inFilePath, outFilePath, customThresh = 8):
PWSDB = open(inFilePath, 'r')
motifList = list()
currentLine = PWSDB.readline().strip()
lineCount = 1
regex = "\[([\w\t\.\^-]+)\]"
while currentLine != "":
if currentLine[0] == START:
#New
currHomer = HomerData(currentLine)
#Get the next four lines
a_line = extractMatrixLineValues(PWSDB.readline().strip(), regex)
c_line = extractMatrixLineValues(PWSDB.readline().strip(), regex)
g_line = extractMatrixLineValues(PWSDB.readline().strip(), regex)
t_line = extractMatrixLineValues(PWSDB.readline().strip(), regex)
currHomer.matrix = transformMatrix([a_line, c_line, g_line, t_line])
currHomer.thresh = customThresh
currHomer.name = str(lineCount) + "-" + currentLine[1:]
motifList.append(currHomer)
currentLine = PWSDB.readline().strip()
lineCount +=1
writeHomerDB(outFilePath, motifList)
print ("Converted PlantPan database to HOMER format! Check results.")

#Converts CIS-BP format PWMs into a HOMER-compatible .motifs file
#@param inPath- the path to a directory containing all of the PWMs you wish to include in the HOMER file
##Note that this is the standard CIS-BP download format available on line
#@param outfilePath- where you would like to write the final .motifs database file to
#@param customThresh- the logOddsDetection threshold required for HOMER files. For a Db, this value can be anywhere between
##5 and 10 with little effect on outcome
def convertCISPBtoHomerLibrary(inPath, outFilePath, customThresh=8):
motifList = list()
regex = "\d\t([\w\t\.\^-]+)"
counter = 0
for i in os.listdir(inPath):
currFile = open(inPath + "/" + i, 'r')
currentLine = currFile.readline().strip()
currMotifData = list()
idName = regexExtraction(i, '(M\d{4}_\d\.\d\d).txt')
currHomer = HomerData(idName)
while currentLine != "":
if "Pos" not in currentLine:
currMotifData.append(extractMatrixLineValues(currentLine,regex))
currentLine = currFile.readline().strip()
#print "Finished file:", idName
if len(currMotifData) == 0:
continue
currHomer.matrix = currMotifData
currHomer.thresh= customThresh
currHomer.name = str(counter) + idName
currFile.close()
motifList.append(currHomer)
counter += 1
writeHomerDB(outFilePath, motifList)
print ("Converted CISBP database to HOMER format! Check results.")




#Simple a helper tool for safely extracting lines from a file
#@return a list containing each line in the file
#@param fileStream to extract from
def safeLineExtraction(fileStream, lineCount = 4):
lineList = list()
for i in range(0, lineCount):
currLine = fileStream.readline().strip()
if currLine == "":
print ("Empty line found")
else:
lineList.append(currLine)
if len(lineList) == lineCount:
return lineList

###########################################Matrix Operations#####################
#Takes a matrix- list of lists- and transforms it into HOMER accepted format
def transformMatrix(matrix):
if matrix is None:
print ("Matrix is of noneType")
return list()

retMatrix = list()
for i in range(0, len(matrix[0])):
currVector = list()
for j in range(0, len(matrix)):
currVector.append(matrix[j][i])
retMatrix.append(currVector)
return retMatrix

#Uses a custom regular expression to extract the matrix data from a given line
#@param line to search from
#@searchRegex to search using, can't be easily customized :(
#@delim: specify the delimiter between matrix values, default is tab
#@return:
def extractMatrixLineValues(line, searchRegex, delim = '\t'):
dataExtract = re.search(searchRegex, str(line))
if dataExtract:
return dataExtract.group(1).strip().split(delim)
else:
print ("No data located", line)
return list()


###########################################################################################

################################Main function
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='HOMER tool with multiple functions. htmlParser parses HOMER html output from compareMotifs.pl. libConversion takes a database and converts it to .motifs file')
parser.add_argument("-hs", "--htmlSource", help="Specify the path to the homer output of html files.")
parser.add_argument("--dirs", help = "Specify a directory of directories to be printed out")
parser.add_argument("-o", "--output", default = "TF_match.tsv", help="Specify the comparison output file, default is \"TF_match.tsv\"")
parser.add_argument("--htmlParser", action = "store_true", help = "Specify this if you want to use the html parsing tool and print an output for one dir only")
parser.add_argument("-l", "--libConversion",choices=["PP", "CISBP"], help = "Specify the type of library you wish to convert")
parser.add_argument("-lp", "--libPath", help = "Specify the path to the PP or CISBP library you wish to convert")
parser.add_argument("-log", "--logOddsDetection", type=int, default = 8, help = "Specify a log odds detection threshold to useif you are doing a library conversion. Should be between 5.0 and 10.0")
args = parser.parse_args()

if(args.libConversion == "PP"):
if not args.libPath:
convertPPToHomerLibrary("/home/likewise-open/ICE/aomdahl/Datasets/PlantPan/Transcription_factor_weight_matrix.txt", args.output, args.logOddsDetection)
else:
convertPPToHomerLibrary(args.libPath, args.output, args.logOddsDetection)
if (args.libConversion == "CISBP"):
if not args.libPath:
convertCISPBtoHomerLibrary("/home/likewise-open/ICE/aomdahl/Datasets/CIS-BP/PWMs/pwms", args.output, args.logOddsDetection)
else:
convertCISPBtoHomerLibrary(args.libPath, args.output, args.logOddsDetection)

if(args.dirs):
assembleMotifMatchResults(args.dirs)

if(args.htmlParser):
writeMotifMatchHTML(args.htmlSource)

0 comments on commit dec6562

Please sign in to comment.