Update rose2 so it's compatible with linlabcode/pipeline

linlabbcm · Oct 4, 2019 · 7f163aa · 7f163aa
1 parent 2337413
commit 7f163aa
Show file tree

Hide file tree

Showing 3 changed files with 81 additions and 41 deletions.
diff --git a/rose2/__init__.py b/rose2/__init__.py
@@ -497,7 +497,6 @@ def main():
     print('MAKING START DICT')
     startDict = utils.makeStartDict(annotFile)
 
-
     #GET CHROMS FOUND IN THE BAMS
     print('GETTING CHROMS IN BAMFILES')
     bamChromList = getBamChromList(bamFileList)
@@ -511,23 +510,25 @@ def main():
     print('LOADING IN GFF REGIONS')
     referenceCollection = utils.gffToLocusCollection(inputGFF)
 
+    print('STARTING WITH {} INPUT REGIONS'.format(len(referenceCollection)))
     print('CHECKING REFERENCE COLLECTION:')
     checkRefCollection(referenceCollection)
-
 
     # MASKING REFERENCE COLLECTION
     # see if there's a mask
     if options.mask:
         maskFile = options.mask
+        print('USING MASK FILE {}'.format(maskFile))
         # if it's a bed file
         if maskFile.split('.')[-1].upper() == 'BED':
             maskGFF = utils.bedToGFF(maskFile)
         elif maskFile.split('.')[-1].upper() == 'GFF':
             maskGFF = utils.parseTable(maskFile, '\t')
         else:
             print("MASK MUST BE A .gff or .bed FILE")
-            sys.exit()
+
         maskCollection = utils.gffToLocusCollection(maskGFF)
+        print('LOADING {} MASK REGIONS'.format(len(maskCollection)))
 
         # now mask the reference loci
         referenceLoci = referenceCollection.getLoci()

diff --git a/rose2/genemapper.py b/rose2/genemapper.py
@@ -105,7 +105,15 @@ def mapEnhancerToGene(annotFile,enhancerFile,transcribedFile='',uniqueGenes=True
 
     if len(transcribedFile) > 0:
         transcribedTable = utils.parseTable(transcribedFile,'\t')
-        transcribedGenes = [line[1] for line in transcribedTable]
+        #figure out which column has refseq identifiers
+        line = transcribedTable[0]
+        ref_col = 0
+        if len(line) > 1:
+            for i in range(len(line)):
+
+                if line[i].count('NM_') >0 or line[i].count('NR_') >0:
+                    ref_col = i
+        transcribedGenes = [line[ref_col] for line in transcribedTable]
     else:
         transcribedGenes = startDict.keys()
 
@@ -134,25 +142,35 @@ def mapEnhancerToGene(annotFile,enhancerFile,transcribedFile='',uniqueGenes=True
     #list of all genes that appear in this analysis
     overallGeneList = []
 
+    # find the header
+    for line in enhancerTable:
+        if line[0][0] != '#':
+            header = line
+            print('this is the header')
+            print(header)
+            break
+
     if noFormatTable:
-        #set up the output tables
-        #first by enhancer
-        enhancerToGeneTable = [enhancerTable[0]+['OVERLAP_GENES','PROXIMAL_GENES','CLOSEST_GENE']]
+        # set up the output tables
+        # first by enhancer
+        enhancerToGeneTable = [header + ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE']]
 
-
     else:
-        #set up the output tables
-        #first by enhancer
-        enhancerToGeneTable = [enhancerTable[0][0:9]+['OVERLAP_GENES','PROXIMAL_GENES','CLOSEST_GENE'] + enhancerTable[5][-2:]]
-
-        #next by gene
-        geneToEnhancerTable = [['GENE_NAME','REFSEQ_ID','PROXIMAL_ENHANCERS']]
-
-    #next make the gene to enhancer table
-    geneToEnhancerTable = [['GENE_NAME','REFSEQ_ID','PROXIMAL_ENHANCERS','ENHANCER_RANKS','IS_SUPER']]
-
-
+                # set up the output tables
+        # first by enhancer
+        enhancerToGeneTable = [
+            header[0:9] + ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE'] + header[-2:]
+        ]
 
+    # next make the gene to enhancer table
+    geneToEnhancerTable = [[
+        'GENE_NAME',
+        'REFSEQ_ID',
+        'PROXIMAL_ENHANCERS',
+        'ENHANCER_RANKS',
+        'IS_SUPER',
+        'ENHANCER_SIGNAL',
+    ]]
 
     for line in enhancerTable:
         if line[0][0] =='#' or line[0][0] == 'R':
@@ -355,33 +373,35 @@ def mapEnhancerToGeneTop(rankByBamFile, controlBamFile, genome, annotFile, enhan
     # list of all genes that appear in this analysis
     overallGeneList = []
 
-    # find the damn header
+    # find the header
     for line in enhancerTable:
-        if line[0][0] == '#':
-            continue
-        else:
+         if line[0][0] != '#':
             header = line
+            print('this is the header')
+            print(header)
             break
 
     if noFormatTable:
         # set up the output tables
         # first by enhancer
-        enhancerToGeneTable = [
-            header + ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE']]
+        enhancerToGeneTable = [header + ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE']]
 
     else:
         # set up the output tables
         # first by enhancer
         enhancerToGeneTable = [
-            header[0:9] + ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE'] + header[-2:]]
-
-        # next by gene
-        geneToEnhancerTable = [
-            ['GENE_NAME', 'REFSEQ_ID', 'PROXIMAL_ENHANCERS']]
+            header[0:9] + ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE'] + header[-2:]
+        ]
 
     # next make the gene to enhancer table
-    geneToEnhancerTable = [
-        ['GENE_NAME', 'REFSEQ_ID', 'PROXIMAL_ENHANCERS', 'ENHANCER_RANKS', 'IS_SUPER', 'ENHANCER_SIGNAL']]
+    geneToEnhancerTable = [[
+        'GENE_NAME',
+        'REFSEQ_ID',
+        'PROXIMAL_ENHANCERS',
+        'ENHANCER_RANKS',
+        'IS_SUPER',
+        'TSS_SIGNAL',
+    ]]
 
     for line in enhancerTable:
         if line[0][0] == '#' or line[0][0] == 'R':

diff --git a/rose2/utils.py b/rose2/utils.py
@@ -204,19 +204,38 @@ def bedToGFF(bed, output=''):
 
     gff = []
 
-    for line in bed:
+    #determine if this is a long bed or a short bed
 
-        gffLine = [line[0],line[3],'',line[1],line[2],line[4],line[5],'',line[3]]
-        gff.append(gffLine)
+    print(len(bed[0]))
+    print(bed[0])
 
+    if len(bed[0]) == 6: # this is a full format bed
+        bed_style = 'long'
+    elif len(bed[0]) == 5: # this is the medium  length bed with strand
+        bed_style = 'medium'
+    elif len(bed[0]) == 3: # this is the minimum length bed
+        bed_style = 'short'
+    else:
+        print('this is probably not actually a bed')
+        print(bed[0])
+        print('exiting now because the bed is sad')
+        sys.exit()
+    print('this bed has %s columns and is a %s bed' % (len(bed[0]),bed_style))
+    for line in bed:
+        if bed_style == 'long':
+            gffLine = [line[0],line[3],'',line[1],line[2],line[4],line[5],'',line[3]]
+        if bed_style == 'medium':
+            gffLine = [line[0],'','',line[1],line[2],'',line[4],'','']
+        if bed_style == 'short':
+            gffLine = [line[0],'','',line[1],line[2],'','.','','']
+        gff.append(gffLine)
 
     if len(output) > 0:
         unParseTable(gff,output,'\t')
     else:
         return gff
 
 
-
 #100912
 #gffToBed
 
@@ -299,7 +318,7 @@ def getParentFolder(inputFile):
     returns the parent folder for any file
     '''
 
-    parentFolder = join(inputFile.split('/')[:-1],'/') +'/'
+    parentFolder = '/'.join(inputFile.split('/')[:-1]) + '/'
     if parentFolder =='':
         return './'
     else:
@@ -663,9 +682,9 @@ def getConservation(self,phastConFolder):
                 phastBases += lineLen
 
         if phastBases > self.len():
-            print "this locus is sad %s. please debug me" % (self.__str__())
-            print "locus length is %s" % (self.len())
-            print "phastBases are %s" % (phastBases)
+            print("this locus is sad {}. please debug me".format(self.__str__()))
+            print("locus length is {}".format(self.len()))
+            print("phastBases are {}".format(phastBases))
 
 
         return phastSum/self.len()
@@ -1373,7 +1392,7 @@ def gffToFasta(genome,directory,gff,UCSC = True,useID=False):
         if useID:
             name = '>' + line[1]
         else:
-            name = '>'+ join([genome.lower(),line[0],str(line[3]),str(line[4]),line[6]],'|')
+            name = '>' + '|'.join([genome.lower(),line[0],str(line[3]),str(line[4]),line[6]])
         fastaList.append(name)
         if line[6] == '-':
             #print(line[3])