Skip to content

Commit

Permalink
Add two new commands: cmbreps to combine two replicates; bdgopt to
Browse files Browse the repository at this point in the history
modify the score column of bedGraph file such as pscore to qscore
conversion.
  • Loading branch information
taoliu committed Jul 31, 2014
1 parent 470ccac commit 556e2d5
Show file tree
Hide file tree
Showing 13 changed files with 428 additions and 62 deletions.
105 changes: 100 additions & 5 deletions MACS2/IO/cBedGraph.pyx
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Time-stamp: <2014-03-03 13:56:01 Tao Liu>
# Time-stamp: <2014-07-30 23:10:44 Tao Liu>

"""Module for Feature IO classes.
Expand Down Expand Up @@ -42,6 +42,17 @@ __doc__ = "bedGraphTrackI class"
# ------------------------------------
# Misc functions
# ------------------------------------
LOG10_E = 0.43429448190325176

from libc.math cimport log1p, exp, log10

cpdef float fisher_method_combining_two_log10pvalues ( float p1, float p2 ):
#x = 2 * ( p1 + p2 ) / LOG10_E
#return -log10( exp( -x/2 ) * ( 1 + x/2 ) )
return ( p1 + p2 ) - log1p( ( p1 + p2 ) / LOG10_E ) * LOG10_E

cpdef float mean ( float p1, float p2 ):
return ( p1 + p2 ) / 2

# ------------------------------------
# Classes
Expand Down Expand Up @@ -700,7 +711,7 @@ cdef class bedGraphTrackI:
ret.add_loc(chrom,0,max_p,new_value)
return ret

def overlie (self, bdgTrack2, func=max ):
def overlie (self, bdgTrack2, func="max" ):
"""Calculate two bedGraphTrackI objects by letting self
overlying bdgTrack2, with user-defined functions.
Expand Down Expand Up @@ -738,6 +749,15 @@ cdef class bedGraphTrackI:

assert isinstance(bdgTrack2,bedGraphTrackI), "bdgTrack2 is not a bedGraphTrackI object"

if func == "max":
f = max
elif func == "mean":
f = lambda x, y: ( x + y ) / 2
elif func == "fisher":
f = lambda p1, p2: ( p1 + p2 ) - log1p( ( p1 + p2 ) / LOG10_E ) * LOG10_E
else:
raise Exception("Invalid function")

ret = bedGraphTrackI()
retadd = ret.add_loc

Expand Down Expand Up @@ -766,21 +786,21 @@ cdef class bedGraphTrackI:
while True:
if p1 < p2:
# clip a region from pre_p to p1, then set pre_p as p1.
retadd(chrom,pre_p,p1,func(v1,v2))
retadd(chrom,pre_p,p1,f(v1,v2))
pre_p = p1
# call for the next p1 and v1
p1 = p1n()
v1 = v1n()
elif p2 < p1:
# clip a region from pre_p to p2, then set pre_p as p2.
retadd(chrom,pre_p,p2,func(v1,v2))
retadd(chrom,pre_p,p2,f(v1,v2))
pre_p = p2
# call for the next p2 and v2
p2 = p2n()
v2 = v2n()
elif p1 == p2:
# from pre_p to p1 or p2, then set pre_p as p1 or p2.
retadd(chrom,pre_p,p1,func(v1,v2))
retadd(chrom,pre_p,p1,f(v1,v2))
pre_p = p1
# call for the next p1, v1, p2, v2.
p1 = p1n()
Expand All @@ -807,6 +827,81 @@ cdef class bedGraphTrackI:
self.minvalue = func(self.minvalue)
return True

def p2q ( self ):
"""Convert pvalue scores to qvalue scores.
*Assume scores in this bedGraph are pvalue scores! Not work
for other type of scores.
"""
cdef:
str chrom
object pos_array, pscore_array
dict pvalue_stat = {}
dict pqtable = {}
long n, pre_p, this_p, length, j, pre_l, l, i
double this_v, pre_v, v, q, pre_q, this_t, this_c
long N, k, this_l
double f
long nhcal = 0
long npcal = 0
list unique_values
double t0, t1, t

# calculate frequencies of each p-score
for chrom in self.__data.keys():
pre_p = 0

[pos_array, pscore_array] = self.__data[ chrom ]

pn = iter(pos_array).next
vn = iter(pscore_array).next

for i in range( len( pos_array ) ):
this_p = pn()
this_v = vn()
this_l = this_p - pre_p
if pvalue_stat.has_key( this_v ):
pvalue_stat[ this_v ] += this_l
else:
pvalue_stat[ this_v ] = this_l
pre_p = this_p

nhcal += len( pos_array )

nhval = 0

N = sum(pvalue_stat.values()) # total length
k = 1 # rank
f = -log10(N)
pre_v = -2147483647
pre_l = 0
pre_q = 2147483647 # save the previous q-value

# calculate qscore for each pscore
pqtable = {}
unique_values = sorted(pvalue_stat.keys(), reverse=True)
for i in range(len(unique_values)):
v = unique_values[i]
l = pvalue_stat[v]
q = v + (log10(k) + f)
q = max(0,min(pre_q,q)) # make q-score monotonic
pqtable[ v ] = q
pre_v = v
pre_q = q
k+=l
nhcal += 1

# convert pscore to qscore
for chrom in self.__data.keys():
[pos_array, pscore_array] = self.__data[ chrom ]

for i in range( len( pos_array ) ):
pscore_array[ i ] = pqtable[ pscore_array[ i ] ]

self.merge_regions()
return


def extract_value ( self, bdgTrack2 ):
"""It's like overlie function. THe overlapped regions between
bdgTrack2 and self, will be recorded. The values from self in
Expand Down
51 changes: 12 additions & 39 deletions MACS2/IO/cParser.pyx
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Time-stamp: <2014-06-06 10:24:14 Tao Liu>
# Time-stamp: <2014-07-29 23:55:08 Tao Liu>

"""Module for all MACS Parser classes for input.
Expand Down Expand Up @@ -158,7 +158,7 @@ cdef class GenericParser:
else:
self.fhd = io.open( filename, mode='rb' ) # binary mode! I don't expect unicode here!

def tsize( self ):
cpdef int tsize( self ):
"""General function to detect tag size.
* Although it can be used by most parsers, it must be
Expand Down Expand Up @@ -189,13 +189,13 @@ cdef class GenericParser:
self.tag_size = s/n
return self.tag_size

def __tlen_parse_line ( self, str thisline ):
cdef __tlen_parse_line ( self, str thisline ):
"""Abstract function to detect tag length.
"""
raise NotImplemented

def build_fwtrack ( self ):
cpdef build_fwtrack ( self ):
"""Generic function to build FWTrackIII object. Create a new
FWTrackIII object. If you want to append new records to an
existing FWTrackIII object, try append_fwtrack function.
Expand Down Expand Up @@ -228,7 +228,7 @@ cdef class GenericParser:
self.close()
return fwtrack

def append_fwtrack ( self, fwtrack ):
cpdef append_fwtrack ( self, fwtrack ):
"""Add more records to an existing FWTrackIII object.
"""
Expand All @@ -252,9 +252,7 @@ cdef class GenericParser:
self.close()
return fwtrack



def __fw_parse_line ( self, str thisline ):
cdef __fw_parse_line ( self, str thisline ):
"""Abstract function to parse chromosome, 5' end position and
strand.
Expand All @@ -264,7 +262,7 @@ cdef class GenericParser:
cdef int strand = -1
return ( chromosome, fpos, strand )

def sniff ( self ):
cpdef sniff ( self ):
"""Detect whether this parser is the correct parser for input
file.
Expand All @@ -288,7 +286,7 @@ cdef class GenericParser:
self.fhd.seek( 0 )
return True

def close ( self ):
cpdef close ( self ):
"""Run this when this Parser will be never used.
Close file I/O stream.
Expand All @@ -299,7 +297,7 @@ cdef class BEDParser( GenericParser ):
"""File Parser Class for tabular File.
"""
def __tlen_parse_line ( self, str thisline ):
cdef __tlen_parse_line ( self, str thisline ):
"""Parse 5' and 3' position, then calculate tag length.
"""
Expand All @@ -313,7 +311,7 @@ cdef class BEDParser( GenericParser ):
thisfields = thisline.split( '\t' )
return atoi( thisfields[ 2 ] )-atoi( thisfields[ 1 ] )

def __fw_parse_line ( self, str thisline ):
cdef __fw_parse_line ( self, str thisline ):
#cdef list thisfields
cdef char * chromname

Expand Down Expand Up @@ -349,12 +347,11 @@ cdef class BEDParser( GenericParser ):
atoi( thisfields[ 1 ] ),
0 )


cdef class ELANDResultParser( GenericParser ):
"""File Parser Class for tabular File.
"""
def __tlen_parse_line ( self, str thisline ):
cdef __tlen_parse_line ( self, str thisline ):
"""Parse tag sequence, then tag length.
"""
Expand All @@ -366,7 +363,7 @@ cdef class ELANDResultParser( GenericParser ):
else:
return len( thisfields[ 1 ] )

def __fw_parse_line ( self, str thisline ):
cdef __fw_parse_line ( self, str thisline ):
cdef:
str chromname, strand
int thistaglength
Expand Down Expand Up @@ -702,30 +699,6 @@ cdef class BAMParser( GenericParser ):
return False

cpdef int tsize ( self ):
if HAS_PYSAM:
return self.__tsize_w_pysam()
else:
return self.__tsize_wo_pysam()

cdef int __tsize_w_pysam( self ):
cdef:
int n = 0 # successful read of tag size
double s = 0 # sum of tag sizes

if self.tag_size != -1:
# if we have already calculated tag size (!= -1), return it.
return self.tag_size

for n in range( 1, 11, 1 ):
try:
s += self.fhd.next().qlen
except:
break
self.tag_size = int( s/n )
self.fhd.reset()
return self.tag_size

cdef int __tsize_wo_pysam( self ):
"""Get tag size from BAM file -- read l_seq field.
Refer to: http://samtools.sourceforge.net/SAM1.pdf
Expand Down
78 changes: 77 additions & 1 deletion MACS2/OptValidator.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Time-stamp: <2014-06-17 01:39:03 Tao Liu>
# Time-stamp: <2014-07-30 23:05:15 Tao Liu>

"""Module Description
Expand Down Expand Up @@ -650,3 +650,79 @@ def opt_validate_bdgcmp ( options ):

return options


def opt_validate_cmbreps ( options ):
"""Validate options from a OptParser object.
Ret: Validated options object.
"""
# logging object
logging.basicConfig(level=20,
format='%(levelname)-5s @ %(asctime)s: %(message)s ',
datefmt='%a, %d %b %Y %H:%M:%S',
stream=sys.stderr,
filemode="w"
)

options.error = logging.critical # function alias
options.warn = logging.warning
options.debug = logging.debug
options.info = logging.info

# methods should be valid:


if options.method not in [ 'fisher', 'max', 'mean']:
logging.error( "Invalid method: %s" % options.method )
sys.exit( 1 )

if len( options.ifile ) != 2:
logging.error("Only support two replicates!")
sys.exit( 1 )

# # of -i must == # of -w

# if not options.weights:
# options.weights = [ 1.0 ] * len( options.ifile )

# if len( options.ifile ) != len( options.weights ):
# logging.error("Must provide same number of weights as number of input files.")
# sys.exit( 1 )

# if options.method == "fisher" and len( options.ifile ) > 3:
# logging.error("NOT IMPLEMENTED! Can't combine more than 3 replicates using Fisher's method.")
# sys.exit( 1 )

return options


def opt_validate_bdgopt ( options ):
"""Validate options from a OptParser object.
Ret: Validated options object.
"""
# logging object
logging.basicConfig(level=20,
format='%(levelname)-5s @ %(asctime)s: %(message)s ',
datefmt='%a, %d %b %Y %H:%M:%S',
stream=sys.stderr,
filemode="w"
)

options.error = logging.critical # function alias
options.warn = logging.warning
options.debug = logging.debug
options.info = logging.info

# methods should be valid:

if options.method.lower() not in [ 'multiply', 'add', 'p2q']:
logging.error( "Invalid method: %s" % options.method )
sys.exit( 1 )

if options.method.lower() in [ 'multiply', 'add' ] and not options.extparam:
logging.error( "Need EXTRAPARAM for method multiply or add!")
sys.exit( 1 )

return options

2 changes: 1 addition & 1 deletion MACS2/Poisson.pyx → MACS2/Statistics.pyx
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
cdef extern from "cPoisson.h":
cdef extern from "cStatistics.h":
double log10_poisson_cdf ( unsigned int n, double lam, short lower )
double log10_poisson_cdf_P_large_lambda ( unsigned int k, double lbd )
double log10_poisson_cdf_Q_large_lambda ( unsigned int k, double lbd )
Expand Down
2 changes: 1 addition & 1 deletion MACS2/bdgcmp.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Time-stamp: <2013-10-28 12:17:33 Tao Liu>
# Time-stamp: <2014-07-30 02:40:36 Tao Liu>

import sys
import os
Expand Down
Loading

0 comments on commit 556e2d5

Please sign in to comment.