Skip to content

Commit

Permalink
Update rose2 so it's compatible with linlabcode/pipeline
Browse files Browse the repository at this point in the history
  • Loading branch information
jvrakor committed Oct 4, 2019
1 parent 2337413 commit 7f163aa
Show file tree
Hide file tree
Showing 3 changed files with 81 additions and 41 deletions.
7 changes: 4 additions & 3 deletions rose2/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -497,7 +497,6 @@ def main():
print('MAKING START DICT')
startDict = utils.makeStartDict(annotFile)


#GET CHROMS FOUND IN THE BAMS
print('GETTING CHROMS IN BAMFILES')
bamChromList = getBamChromList(bamFileList)
Expand All @@ -511,23 +510,25 @@ def main():
print('LOADING IN GFF REGIONS')
referenceCollection = utils.gffToLocusCollection(inputGFF)

print('STARTING WITH {} INPUT REGIONS'.format(len(referenceCollection)))
print('CHECKING REFERENCE COLLECTION:')
checkRefCollection(referenceCollection)


# MASKING REFERENCE COLLECTION
# see if there's a mask
if options.mask:
maskFile = options.mask
print('USING MASK FILE {}'.format(maskFile))
# if it's a bed file
if maskFile.split('.')[-1].upper() == 'BED':
maskGFF = utils.bedToGFF(maskFile)
elif maskFile.split('.')[-1].upper() == 'GFF':
maskGFF = utils.parseTable(maskFile, '\t')
else:
print("MASK MUST BE A .gff or .bed FILE")
sys.exit()

maskCollection = utils.gffToLocusCollection(maskGFF)
print('LOADING {} MASK REGIONS'.format(len(maskCollection)))

# now mask the reference loci
referenceLoci = referenceCollection.getLoci()
Expand Down
78 changes: 49 additions & 29 deletions rose2/genemapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,15 @@ def mapEnhancerToGene(annotFile,enhancerFile,transcribedFile='',uniqueGenes=True

if len(transcribedFile) > 0:
transcribedTable = utils.parseTable(transcribedFile,'\t')
transcribedGenes = [line[1] for line in transcribedTable]
#figure out which column has refseq identifiers
line = transcribedTable[0]
ref_col = 0
if len(line) > 1:
for i in range(len(line)):

if line[i].count('NM_') >0 or line[i].count('NR_') >0:
ref_col = i
transcribedGenes = [line[ref_col] for line in transcribedTable]
else:
transcribedGenes = startDict.keys()

Expand Down Expand Up @@ -134,25 +142,35 @@ def mapEnhancerToGene(annotFile,enhancerFile,transcribedFile='',uniqueGenes=True
#list of all genes that appear in this analysis
overallGeneList = []

# find the header
for line in enhancerTable:
if line[0][0] != '#':
header = line
print('this is the header')
print(header)
break

if noFormatTable:
#set up the output tables
#first by enhancer
enhancerToGeneTable = [enhancerTable[0]+['OVERLAP_GENES','PROXIMAL_GENES','CLOSEST_GENE']]
# set up the output tables
# first by enhancer
enhancerToGeneTable = [header + ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE']]


else:
#set up the output tables
#first by enhancer
enhancerToGeneTable = [enhancerTable[0][0:9]+['OVERLAP_GENES','PROXIMAL_GENES','CLOSEST_GENE'] + enhancerTable[5][-2:]]

#next by gene
geneToEnhancerTable = [['GENE_NAME','REFSEQ_ID','PROXIMAL_ENHANCERS']]

#next make the gene to enhancer table
geneToEnhancerTable = [['GENE_NAME','REFSEQ_ID','PROXIMAL_ENHANCERS','ENHANCER_RANKS','IS_SUPER']]


# set up the output tables
# first by enhancer
enhancerToGeneTable = [
header[0:9] + ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE'] + header[-2:]
]

# next make the gene to enhancer table
geneToEnhancerTable = [[
'GENE_NAME',
'REFSEQ_ID',
'PROXIMAL_ENHANCERS',
'ENHANCER_RANKS',
'IS_SUPER',
'ENHANCER_SIGNAL',
]]

for line in enhancerTable:
if line[0][0] =='#' or line[0][0] == 'R':
Expand Down Expand Up @@ -355,33 +373,35 @@ def mapEnhancerToGeneTop(rankByBamFile, controlBamFile, genome, annotFile, enhan
# list of all genes that appear in this analysis
overallGeneList = []

# find the damn header
# find the header
for line in enhancerTable:
if line[0][0] == '#':
continue
else:
if line[0][0] != '#':
header = line
print('this is the header')
print(header)
break

if noFormatTable:
# set up the output tables
# first by enhancer
enhancerToGeneTable = [
header + ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE']]
enhancerToGeneTable = [header + ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE']]

else:
# set up the output tables
# first by enhancer
enhancerToGeneTable = [
header[0:9] + ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE'] + header[-2:]]

# next by gene
geneToEnhancerTable = [
['GENE_NAME', 'REFSEQ_ID', 'PROXIMAL_ENHANCERS']]
header[0:9] + ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE'] + header[-2:]
]

# next make the gene to enhancer table
geneToEnhancerTable = [
['GENE_NAME', 'REFSEQ_ID', 'PROXIMAL_ENHANCERS', 'ENHANCER_RANKS', 'IS_SUPER', 'ENHANCER_SIGNAL']]
geneToEnhancerTable = [[
'GENE_NAME',
'REFSEQ_ID',
'PROXIMAL_ENHANCERS',
'ENHANCER_RANKS',
'IS_SUPER',
'TSS_SIGNAL',
]]

for line in enhancerTable:
if line[0][0] == '#' or line[0][0] == 'R':
Expand Down
37 changes: 28 additions & 9 deletions rose2/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,19 +204,38 @@ def bedToGFF(bed, output=''):

gff = []

for line in bed:
#determine if this is a long bed or a short bed

gffLine = [line[0],line[3],'',line[1],line[2],line[4],line[5],'',line[3]]
gff.append(gffLine)
print(len(bed[0]))
print(bed[0])

if len(bed[0]) == 6: # this is a full format bed
bed_style = 'long'
elif len(bed[0]) == 5: # this is the medium length bed with strand
bed_style = 'medium'
elif len(bed[0]) == 3: # this is the minimum length bed
bed_style = 'short'
else:
print('this is probably not actually a bed')
print(bed[0])
print('exiting now because the bed is sad')
sys.exit()
print('this bed has %s columns and is a %s bed' % (len(bed[0]),bed_style))
for line in bed:
if bed_style == 'long':
gffLine = [line[0],line[3],'',line[1],line[2],line[4],line[5],'',line[3]]
if bed_style == 'medium':
gffLine = [line[0],'','',line[1],line[2],'',line[4],'','']
if bed_style == 'short':
gffLine = [line[0],'','',line[1],line[2],'','.','','']
gff.append(gffLine)

if len(output) > 0:
unParseTable(gff,output,'\t')
else:
return gff



#100912
#gffToBed

Expand Down Expand Up @@ -299,7 +318,7 @@ def getParentFolder(inputFile):
returns the parent folder for any file
'''

parentFolder = join(inputFile.split('/')[:-1],'/') +'/'
parentFolder = '/'.join(inputFile.split('/')[:-1]) + '/'
if parentFolder =='':
return './'
else:
Expand Down Expand Up @@ -663,9 +682,9 @@ def getConservation(self,phastConFolder):
phastBases += lineLen

if phastBases > self.len():
print "this locus is sad %s. please debug me" % (self.__str__())
print "locus length is %s" % (self.len())
print "phastBases are %s" % (phastBases)
print("this locus is sad {}. please debug me".format(self.__str__()))
print("locus length is {}".format(self.len()))
print("phastBases are {}".format(phastBases))


return phastSum/self.len()
Expand Down Expand Up @@ -1373,7 +1392,7 @@ def gffToFasta(genome,directory,gff,UCSC = True,useID=False):
if useID:
name = '>' + line[1]
else:
name = '>'+ join([genome.lower(),line[0],str(line[3]),str(line[4]),line[6]],'|')
name = '>' + '|'.join([genome.lower(),line[0],str(line[3]),str(line[4]),line[6]])
fastaList.append(name)
if line[6] == '-':
#print(line[3])
Expand Down

0 comments on commit 7f163aa

Please sign in to comment.