Skip to content

Commit

Permalink
Merge pull request #4 from tariqdaouda/bloody
Browse files Browse the repository at this point in the history
Merge with bloody
  • Loading branch information
courcelm authored Jan 8, 2018
2 parents 812ae45 + da1acea commit dd65915
Show file tree
Hide file tree
Showing 23 changed files with 858 additions and 529 deletions.
25 changes: 25 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Travis CI configuration: installs Miniconda, builds the package and runs the
# unit tests under coverage, then uploads the coverage report to Codecov.
sudo: false

notifications:
  email: false

language: python

python:
  - "2.7"

before_install:
  # The installer is executed with bash right after download, so fetch it over
  # HTTPS rather than plain HTTP to prevent tampering in transit.
  - wget https://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh
  - bash miniconda.sh -b -p $HOME/miniconda
  - export PATH="$HOME/miniconda/bin:$PATH"
  - conda update --yes conda

install:
  - conda install --yes python=$TRAVIS_PYTHON_VERSION pip numpy scipy
  - pip install coverage
  - pip install https://github.com/tariqdaouda/rabaDB/archive/stable.zip
  - python setup.py install

script: coverage run -m unittest discover pyGeno/tests/

after_success: bash <(curl -s https://codecov.io/bash)
19 changes: 19 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
@@ -1,3 +1,22 @@
1.3.2
=====

* Search now uses KMD by default instead of dichotomic search (massive speed gain). Many thanks to @Keija for the implementation. Go to https://github.com/tariqdaouda/pyGeno/pull/34 for details and benchmarks.

1.3.1
=====

* AGN SNPs Quality cast to float by importer
* Travis integration
* Minor CSV parser updates

1.3.0
=====

* CSVFile will now ignore empty lines and comments

* Added synonymousCodonsFrequencies

1.2.9
=====

Expand Down
4 changes: 2 additions & 2 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,14 @@ pyGeno: A Python package for precision medicine and proteogenomics
:alt: pyGeno's logo


Even though more and more research focuses on Personalized/Precision Medicine, treatments that are specially tailored to the patient, pyGeno is (to our knowledge) the only tool available that will gladly build your specific genomes for you.
pyGeno is (to our knowledge) the only tool available that will gladly build your specific genomes for you.

pyGeno is developed by `Tariq Daouda`_ at the *Institute for Research in Immunology and Cancer* (IRIC_), its logo is the work of the freelance designer `Sawssan Kaddoura`_.
For the latest news about pyGeno, you can follow me on twitter `@tariqdaouda`_.

.. _Tariq Daouda: http://www.tariqdaouda.com
.. _IRIC: http://www.iric.ca
.. _Sawssan Kaddoura: http://www.sawssankaddoura.com
.. _Sawssan Kaddoura: http://sawssankaddoura.com

Click here for The `full documentation`_.

Expand Down
27 changes: 18 additions & 9 deletions pyGeno/Chromosome.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
import copy
#import copy
#import types
#from tools import UsefulFunctions as uf

from types import *
import configuration as conf
from pyGenoObjectBases import *
Expand All @@ -10,11 +13,8 @@
import rabaDB.fields as rf

from tools.SecureMmap import SecureMmap as SecureMmap
from tools import UsefulFunctions as uf
from tools import SingletonManager

import types

import pyGeno.configuration as conf

class ChrosomeSequence(object) :
Expand All @@ -30,7 +30,7 @@ def __init__(self, data, chromosome, refOnly = False) :
def setSNPFilter(self, SNPFilter) :
    """Store *SNPFilter* on this object as self.SNPFilter."""
    self.SNPFilter = SNPFilter

def _getSequence(self, slic) :
def getSequenceData(self, slic) :
data = self.data[slic]
SNPTypes = self.chromosome.genome.SNPTypes

Expand Down Expand Up @@ -63,15 +63,21 @@ def _getSequence(self, slic) :
# print sequenceModifier.alleles
if sequenceModifier is not None :
if sequenceModifier.__class__ is SF.SequenceDel :
data = data[:seqPos] + data[seqPos + sequenceModifier.length:]
seqPos = seqPos + sequenceModifier.offset
#Blank out the deleted positions instead of removing them: changing the length of the sequence could create bugs or side effects
data[seqPos:(seqPos + sequenceModifier.length)] = [''] * sequenceModifier.length
elif sequenceModifier.__class__ is SF.SequenceSNP :
data[seqPos] = sequenceModifier.alleles
elif sequenceModifier.__class__ is SF.SequenceInsert :
data[seqPos] = "%s%s" % (sequenceModifier.bases, data[seqPos])
seqPos = seqPos + sequenceModifier.offset
data[seqPos] = "%s%s" % (data[seqPos], sequenceModifier.bases)
else :
raise TypeError("sequenceModifier on chromosome: %s starting at: %s is of unknown type: %s" % (self.chromosome.number, snp.start, sequenceModifier.__class__))
# print data
return ''.join(data)

return data

def _getSequence(self, slic) :
    """Return the part of the sequence selected by *slic*, as a string.

    The complete sequence data is requested from getSequenceData first,
    joined into one string, and only then indexed with *slic*.
    """
    wholeRange = slice(0, None, 1)
    pieces = self.getSequenceData(wholeRange)
    fullSequence = ''.join(pieces)
    return fullSequence[slic]

def __getitem__(self, i) :
    """Indexing hook: return the result of self._getSequence(i)."""
    return self._getSequence(i)
Expand Down Expand Up @@ -116,6 +122,9 @@ def __init__(self, *args, **kwargs) :
self.refSequence = ChrosomeSequence(datMap, self, refOnly = True)
self.loadSequences = False

def getSequenceData(self, slic) :
    """Return self.sequence.getSequenceData(slic), i.e. delegate to the sequence object."""
    sequenceData = self.sequence.getSequenceData(slic)
    return sequenceData

def _makeLoadQuery(self, objectType, *args, **coolArgs) :
if issubclass(objectType, SNP_INDEL) :
f = RabaQuery(objectType, namespace = self._wrapped_class._raba_namespace)
Expand Down
36 changes: 18 additions & 18 deletions pyGeno/Exon.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ class Exon(pyGenoRabaObjectWrapper) :

def __init__(self, *args, **kwargs) :
pyGenoRabaObjectWrapper.__init__(self, *args, **kwargs)
self._load_sequencesTriggers = set(["UTR5", "UTR3", "CDS", "sequence"])
self._load_sequencesTriggers = set(["UTR5", "UTR3", "CDS", "sequence", "data"])

def _makeLoadQuery(self, objectType, *args, **coolArgs) :
if issubclass(objectType, SNP_INDEL) :
Expand All @@ -74,35 +74,35 @@ def _makeLoadQuery(self, objectType, *args, **coolArgs) :

return pyGenoRabaObjectWrapper._makeLoadQuery(self, objectType, *args, **coolArgs)

def _load_sequences(self) :
seq = self.chromosome.sequence[self.start : self.end]
diffLen = (self.end-self.start) - len(seq)

# print "=---", len(seq), self.end-self.start
# print "=---", seq[-1], '...'
def _load_data(self) :
data = self.chromosome.getSequenceData(slice(self.start,self.end))

diffLen = (self.end-self.start) - len(data)

if self.strand == '+' :
self.sequence = seq
self.data = data
else :
self.sequence = uf.reverseComplement(str(seq))
self.data = uf.reverseComplementTab(data)

if self.hasCDS() :
start = self.CDS_start-self.start
end = self.CDS_end-self.start

if self.strand == '+' :
self.UTR5 = self.sequence[:start]
self.CDS = self.sequence[start:end+diffLen]
self.UTR3 = self.sequence[end+diffLen:]
self.UTR5 = self.data[:start]
self.CDS = self.data[start:end+diffLen]
self.UTR3 = self.data[end+diffLen:]
else :
self.UTR5 = self.sequence[:len(self.sequence)-(end-diffLen)]
self.CDS = self.sequence[len(self.sequence)-(end-diffLen):len(self.sequence)-start]
self.UTR3 = self.sequence[len(self.sequence)-start:]
self.UTR5 = self.data[:len(self.data)-(end-diffLen)]
self.CDS = self.data[len(self.data)-(end-diffLen):len(self.data)-start]
self.UTR3 = self.data[len(self.data)-start:]
else :
self.UTR5 = ''
self.CDS = ''
self.UTR3 = ''

self.sequence = ''.join(self.data)

def _load_bin_sequence(self) :
self.bin_sequence = NucBinarySequence(self.sequence)
self.bin_UTR5 = NucBinarySequence(self.UTR5)
Expand All @@ -123,15 +123,15 @@ def find(self, sequence) :
"""return the position of the first occurrence of sequence"""
return self.bin_sequence.find(sequence)

def findAll(self, seqence):
def findAll(self, sequence):
    """Returns a list of all positions where sequence was found"""
    positions = self.bin_sequence.findAll(sequence)
    return positions

def findInCDS(self, sequence) :
    """return the position of the first occurrence of sequence in the CDS"""
    position = self.bin_CDS.find(sequence)
    return position

def findAllInCDS(self, seqence):
def findAllInCDS(self, sequence):
    """Returns a list of all positions where sequence was found in the CDS"""
    positions = self.bin_CDS.findAll(sequence)
    return positions

Expand Down
30 changes: 28 additions & 2 deletions pyGeno/SNPFiltering.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,15 +25,41 @@ def __init__(self, alleles, sources = {}) :
class SequenceInsert(Sequence_modifiers) :
"""Represents an Insertion to be applied to the sequence"""

def __init__(self, bases, sources = {}) :
def __init__(self, bases, sources = {}, ref = '-') :
    """Build an insertion.

    bases: the inserted bases, or the full alternative allele when *ref* is given.
    sources: passed unchanged to Sequence_modifiers.__init__.
    ref: reference allele; '-' (the default) means *bases* is used as is.

    When *ref* is given it must be a prefix of *bases* (formats such as
    C/CCTGGAA or CCT/CCTGGAA). The prefix is then stripped from self.bases
    and self.offset is set to len(ref) - 1.

    Raises NotImplementedError when *ref* is given but is not a prefix of *bases*.
    """
    # NOTE(review): `sources = {}` is a shared mutable default; kept as is so the
    # signature stays unchanged for callers.
    Sequence_modifiers.__init__(self, sources)
    self.bases = bases
    self.offset = 0

    # Allow formats like C/CCTGGAA (dbSNP) or CCT/CCTGGAA (samtools)
    if ref != '-' :
        if ref != bases[:len(ref)] :
            # NotImplemented is a constant, not an exception class: calling it
            # raised a TypeError and lost this message. NotImplementedError is
            # the exception that was intended.
            raise NotImplementedError("This format of Insetion is not accepted. Please change your format, or implement your format in pyGeno.")

        self.bases = self.bases[len(ref):]
        # -1 because if the insertion is after the last nucleotide we would go past the end of the table
        self.offset = len(ref) - 1


class SequenceDel(Sequence_modifiers) :
"""Represents a Deletion to be applied to the sequence"""
def __init__(self, length, sources = {}) :
def __init__(self, length, sources = {}, ref = None, alt = '-') :
    """Build a deletion.

    length: stored as self.length; when *alt* is given, len(alt) is subtracted from it.
    sources: passed unchanged to Sequence_modifiers.__init__.
    ref: reference allele, required when *alt* is given.
    alt: alternative allele; '-' (the default) means no prefix handling is done.

    When *alt* is given it must be a prefix of *ref* (formats such as
    CCTGGAA/C or CCTGGAA/CCT). self.offset is then set to len(alt).

    Raises Exception when *alt* is given without *ref*, and NotImplementedError
    when *alt* is not a prefix of *ref*.
    """
    # NOTE(review): `sources = {}` is a shared mutable default; kept as is so the
    # signature stays unchanged for callers.
    Sequence_modifiers.__init__(self, sources)
    self.length = length
    self.offset = 0

    # Allow formats like CCTGGAA/C (dbSNP) or CCTGGAA/CCT (samtools)
    if alt != '-' :
        if ref is None :
            raise Exception("You need to add a ref sequence in your call of SequenceDel. Or implement your format in pyGeno.")

        if alt != ref[:len(alt)] :
            # NotImplemented is a constant, not an exception class: calling it
            # raised a TypeError and lost this message. NotImplementedError is
            # the exception that was intended.
            raise NotImplementedError("This format of Deletion is not accepted. Please change your format, or implement your format in pyGeno.")

        self.offset = len(alt)
        self.length = self.length - len(alt)



class SNPFilter(object) :
"""Abtract Class. All filters must inherit from me"""
Expand Down
33 changes: 21 additions & 12 deletions pyGeno/Transcript.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ class Transcript_Raba(pyGenoRabaObject) :
start = rf.Primitive()
end = rf.Primitive()
coding = rf.Primitive()
biotype = rf.Primitive()

genome = rf.RabaObject('Genome_Raba')
chromosome = rf.RabaObject('Chromosome_Raba')
Expand All @@ -34,7 +35,15 @@ def _curate(self) :
self.name = self.name.upper()

self.length = abs(self.end - self.start)
if self.exons[0].CDS_start is not None and self.exons[-1].CDS_end is not None :
have_CDS_start = False
have_CDS_end = False
for exon in self.exons :
if exon.CDS_start is not None :
have_CDS_start = True
if exon.CDS_end is not None :
have_CDS_end = True

if have_CDS_start and have_CDS_end :
self.coding = True
else :
self.coding = False
Expand All @@ -47,7 +56,7 @@ class Transcript(pyGenoRabaObjectWrapper) :
def __init__(self, *args, **kwargs) :
pyGenoRabaObjectWrapper.__init__(self, *args, **kwargs)
self.exons = RLWrapper(self, Exon, self.wrapped_object.exons)
self._load_sequencesTriggers = set(["UTR5", "UTR3", "cDNA", "sequence"])
self._load_sequencesTriggers = set(["UTR5", "UTR3", "cDNA", "sequence", "data"])
self.exonsDict = {}

def _makeLoadQuery(self, objectType, *args, **coolArgs) :
Expand All @@ -70,14 +79,14 @@ def _makeLoadQuery(self, objectType, *args, **coolArgs) :

return pyGenoRabaObjectWrapper._makeLoadQuery(self, objectType, *args, **coolArgs)

def _load_sequences(self) :
def _load_data(self) :
def getV(k) :
return pyGenoRabaObjectWrapper.__getattribute__(self, k)

def setV(k, v) :
return pyGenoRabaObjectWrapper.__setattr__(self, k, v)

sequence = []
self.data = []
cDNA = []
UTR5 = []
UTR3 = []
Expand All @@ -87,20 +96,20 @@ def setV(k, v) :
e = pyGenoRabaObjectWrapper_metaclass._wrappers[Exon_Raba](wrapped_object_and_bag = (ee, getV('bagKey')))
self.exonsDict[(e.start, e.end)] = e
exons.append(e)
sequence.append(e.sequence)

self.data.extend(e.data)
if e.hasCDS() :
UTR5.append(e.UTR5)
cDNA.append(e.CDS)
UTR3.append(e.UTR3)
UTR5.append(''.join(e.UTR5))
cDNA.append(''.join(e.CDS))
UTR3.append(''.join(e.UTR3))
prime5 = False
else :
if prime5 :
UTR5.append(e.sequence)
UTR5.append(''.join(e.data))
else :
UTR3.append(e.sequence)
UTR3.append(''.join(e.data))

sequence = ''.join(sequence)
sequence = ''.join(self.data)
cDNA = ''.join(cDNA)
UTR5 = ''.join(UTR5)
UTR3 = ''.join(UTR3)
Expand Down
Empty file.
4 changes: 2 additions & 2 deletions pyGeno/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,8 +89,8 @@ def pyGeno_init() :
os.makedirs(pyGeno_SETTINGS_DIR)

pyGeno_SETTINGS_PATH = getSettingsPath()
pyGeno_RABA_DBFILE = os.path.normpath('%s/pyGenoRaba.db' % pyGeno_SETTINGS_PATH)
pyGeno_DATA_PATH = os.path.normpath('%s/data' % pyGeno_SETTINGS_PATH)
pyGeno_RABA_DBFILE = os.path.normpath( os.path.join(pyGeno_SETTINGS_PATH, "pyGenoRaba.db") )
pyGeno_DATA_PATH = os.path.normpath( os.path.join(pyGeno_SETTINGS_PATH, "data") )

if not os.path.exists(pyGeno_SETTINGS_PATH) :
os.makedirs(pyGeno_SETTINGS_PATH)
Expand Down
9 changes: 4 additions & 5 deletions pyGeno/doc/source/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,9 @@ pyGeno: A Python package for precision medicine and proteogenomics
.. image:: http://depsy.org/api/package/pypi/pyGeno/badge.svg
:alt: depsy
:target: http://depsy.org/package/python/pyGeno

.. image:: https://img.shields.io/pypi/dm/pyGeno.svg
:alt: downloads
:target: https://pypi.python.org/pypi/pyGeno
.. image:: https://img.shields.io/badge/License-Apache%202.0-blue.svg
:target: https://opensource.org/licenses/Apache-2.0
.. image:: https://img.shields.io/badge/python-2.7-blue.svg

pyGeno's `lair is on Github`_.

Expand Down Expand Up @@ -114,8 +113,8 @@ Contents:
installation
bootstraping
querying
datawraps
importation
datawraps
objects
snp_filter
tools
Expand Down
Loading

0 comments on commit dd65915

Please sign in to comment.