Merge pull request #4 from tariqdaouda/bloody

Merge with bloody
tariqdaouda · Jan 8, 2018 · dd65915 · dd65915
2 parents 812ae45 + da1acea
commit dd65915
Show file tree

Hide file tree

Showing 23 changed files with 858 additions and 529 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -0,0 +1,25 @@
+sudo: false
+
+notifications:
+    email: false
+
+language: python
+
+python:
+  - "2.7"
+
+before_install:
+    - wget http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh
+    - bash miniconda.sh -b -p $HOME/miniconda
+    - export PATH="$HOME/miniconda/bin:$PATH"
+    - conda update --yes conda
+
+install:
+    - conda install --yes python=$TRAVIS_PYTHON_VERSION pip numpy scipy
+    - pip install coverage
+    - pip install https://github.com/tariqdaouda/rabaDB/archive/stable.zip
+    - python setup.py install
+
+script: coverage run -m unittest discover pyGeno/tests/
+
+after_success: bash <(curl -s https://codecov.io/bash)
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -1,3 +1,22 @@
+1.3.2
+=====
+
+* Search now uses KMD by default instead of dichotomic search (massive speed gain). Many thanks to @Keija for the implementation. Go to https://github.com/tariqdaouda/pyGeno/pull/34 for details and benchmarks.
+
+1.3.1
+=====
+
+* AGN SNPs Quality cast to float by importer
+* Travis integration
+* Minor CSV parser updates
+
+1.3.0
+=====
+
+* CSVFile will now ignore empty lines and comments
+
+* Added synonymousCodonsFrequencies
+
 1.2.9
 =====
 

diff --git a/README.rst b/README.rst
@@ -9,14 +9,14 @@ pyGeno: A Python package for precision medicine and proteogenomics
    :alt: pyGeno's logo
 
 
-Even though more and more research focuses on Personalized/Precision Medicine, treatments that are specially tailored to the patient, pyGeno is (to our knowledge) the only tool available that will gladly build your specific genomes for you.
+pyGeno is (to our knowledge) the only tool available that will gladly build your specific genomes for you.
 
 pyGeno is developed by `Tariq Daouda`_ at the *Institute for Research in Immunology and Cancer* (IRIC_), its logo is the work of the freelance designer `Sawssan Kaddoura`_.
 For the latest news about pyGeno, you can follow me on twitter `@tariqdaouda`_.
 
 .. _Tariq Daouda: http://www.tariqdaouda.com
 .. _IRIC: http://www.iric.ca
-.. _Sawssan Kaddoura: http://www.sawssankaddoura.com
+.. _Sawssan Kaddoura: http://sawssankaddoura.com
 
 Click here for The `full documentation`_.
 

diff --git a/pyGeno/Chromosome.py b/pyGeno/Chromosome.py
@@ -1,4 +1,7 @@
-import copy
+#import copy
+#import types
+#from tools import UsefulFunctions as uf
+
 from types import *
 import configuration as conf
 from pyGenoObjectBases import *
@@ -10,11 +13,8 @@
 import rabaDB.fields as rf
 
 from tools.SecureMmap import SecureMmap as SecureMmap
-from tools import UsefulFunctions as uf
 from tools import SingletonManager
 
-import types
-
 import pyGeno.configuration as conf
 
 class ChrosomeSequence(object) :
@@ -30,7 +30,7 @@ def __init__(self, data, chromosome, refOnly = False) :
 	def setSNPFilter(self, SNPFilter) :
 		self.SNPFilter = SNPFilter
 
-	def _getSequence(self, slic) :
+	def getSequenceData(self, slic) :
 		data = self.data[slic]
 		SNPTypes = self.chromosome.genome.SNPTypes
 
@@ -63,15 +63,21 @@ def _getSequence(self, slic) :
 			# print sequenceModifier.alleles
 			if sequenceModifier is not None :
 				if sequenceModifier.__class__ is SF.SequenceDel :
-					data = data[:seqPos] + data[seqPos + sequenceModifier.length:]
+					seqPos = seqPos + sequenceModifier.offset
+					# Blank out the deleted positions instead of removing them: changing the length of the sequence could create bugs or side effects
+					data[seqPos:(seqPos + sequenceModifier.length)] = [''] * sequenceModifier.length
 				elif sequenceModifier.__class__ is SF.SequenceSNP :
 					data[seqPos] = sequenceModifier.alleles
 				elif sequenceModifier.__class__ is SF.SequenceInsert :
-					data[seqPos] = "%s%s" % (sequenceModifier.bases, data[seqPos])
+					seqPos = seqPos + sequenceModifier.offset
+					data[seqPos] = "%s%s" % (data[seqPos], sequenceModifier.bases)
 				else :
 					raise TypeError("sequenceModifier on chromosome: %s starting at: %s is of unknown type: %s" % (self.chromosome.number, snp.start, sequenceModifier.__class__))
-		# print data
-		return ''.join(data)
+
+		return data
+
+	def _getSequence(self, slic) :
+		return ''.join(self.getSequenceData(slice(0, None, 1)))[slic]
 
 	def __getitem__(self, i) :
 		return self._getSequence(i)
@@ -116,6 +122,9 @@ def __init__(self, *args, **kwargs) :
 		self.refSequence = ChrosomeSequence(datMap, self, refOnly = True)
 		self.loadSequences = False
 
+	def getSequenceData(self, slic) :
+		return self.sequence.getSequenceData(slic)
+
 	def _makeLoadQuery(self, objectType, *args, **coolArgs) :
 		if issubclass(objectType, SNP_INDEL) :
 			f = RabaQuery(objectType, namespace = self._wrapped_class._raba_namespace)

diff --git a/pyGeno/Exon.py b/pyGeno/Exon.py
@@ -53,7 +53,7 @@ class Exon(pyGenoRabaObjectWrapper) :
 
 	def __init__(self, *args, **kwargs) :
 		pyGenoRabaObjectWrapper.__init__(self, *args, **kwargs)
-		self._load_sequencesTriggers = set(["UTR5", "UTR3", "CDS", "sequence"])
+		self._load_sequencesTriggers = set(["UTR5", "UTR3", "CDS", "sequence", "data"])
 
 	def _makeLoadQuery(self, objectType, *args, **coolArgs) :
 		if issubclass(objectType, SNP_INDEL) :
@@ -74,35 +74,35 @@ def _makeLoadQuery(self, objectType, *args, **coolArgs) :
 
 		return pyGenoRabaObjectWrapper._makeLoadQuery(self, objectType, *args, **coolArgs)
 
-	def _load_sequences(self) :
-		seq = self.chromosome.sequence[self.start : self.end]
-		diffLen = (self.end-self.start) - len(seq)
-
-		# print "=---", len(seq), self.end-self.start
-		# print "=---", seq[-1], '...'
+	def _load_data(self) :
+		data = self.chromosome.getSequenceData(slice(self.start,self.end))
+
+		diffLen = (self.end-self.start) - len(data)
 
 		if self.strand == '+' :
-			self.sequence = seq
+			self.data = data
 		else :
-			self.sequence =  uf.reverseComplement(str(seq))
-		
+			self.data = uf.reverseComplementTab(data)
+
 		if self.hasCDS() :
 			start = self.CDS_start-self.start
 			end = self.CDS_end-self.start
 
 			if self.strand == '+' :
-				self.UTR5 = self.sequence[:start]
-				self.CDS = self.sequence[start:end+diffLen]
-				self.UTR3 = self.sequence[end+diffLen:]
+				self.UTR5 = self.data[:start]
+				self.CDS = self.data[start:end+diffLen]
+				self.UTR3 = self.data[end+diffLen:]
 			else :
-				self.UTR5 = self.sequence[:len(self.sequence)-(end-diffLen)]
-				self.CDS = self.sequence[len(self.sequence)-(end-diffLen):len(self.sequence)-start]
-				self.UTR3 = self.sequence[len(self.sequence)-start:]
+				self.UTR5 = self.data[:len(self.data)-(end-diffLen)]
+				self.CDS = self.data[len(self.data)-(end-diffLen):len(self.data)-start]
+				self.UTR3 = self.data[len(self.data)-start:]
 		else :
 			self.UTR5 = ''
 			self.CDS = ''
 			self.UTR3 = ''
 
+		self.sequence = ''.join(self.data)
+
 	def _load_bin_sequence(self) :
 		self.bin_sequence = NucBinarySequence(self.sequence)
 		self.bin_UTR5 =  NucBinarySequence(self.UTR5)
@@ -123,15 +123,15 @@ def find(self, sequence) :
 		"""return the position of the first occurrence of sequence"""
 		return self.bin_sequence.find(sequence)
 
-	def findAll(self, seqence):
+	def findAll(self, sequence):
 		"""Returns a list of all positions where sequence was found"""
 		return self.bin_sequence.findAll(sequence)
 
 	def findInCDS(self, sequence) :
 		"""return the position of the first occurrence of sequence"""
 		return self.bin_CDS.find(sequence)
 
-	def findAllInCDS(self, seqence):
+	def findAllInCDS(self, sequence):
 		"""Returns a list of all positions where sequence was found"""
 		return self.bin_CDS.findAll(sequence)
 

diff --git a/pyGeno/SNPFiltering.py b/pyGeno/SNPFiltering.py
@@ -25,15 +25,41 @@ def __init__(self, alleles, sources = {}) :
 class SequenceInsert(Sequence_modifiers) :
 	"""Represents an Insertion to be applied to the sequence"""
 
-	def __init__(self, bases, sources = {}) :
+	def __init__(self, bases, sources = {}, ref = '-') :
 		Sequence_modifiers.__init__(self, sources)
 		self.bases = bases
+		self.offset = 0
+
+		# ref != '-' means the alt allele repeats the reference prefix, e.g. C/CCTGGAA (dbSNP) or CCT/CCTGGAA (samtools)
+		if ref != '-':
+			if ref == bases[:len(ref)]:
+				self.offset = len(ref)
+				self.bases = self.bases[self.offset:]
+				# Step back by one so the offset becomes len(ref) - 1 (the index of the last prefix base); otherwise an insertion after the last nucleotide would index out of range
+				self.offset -= 1
+			else: # unsupported notation; NotImplementedError is the exception class (the NotImplemented constant is not callable)
+				raise NotImplementedError("This format of Insetion is not accepted. Please change your format, or implement your format in pyGeno.")
+
 
 class SequenceDel(Sequence_modifiers) :
 	"""Represents a Deletion to be applied to the sequence"""
-	def __init__(self, length, sources = {}) :
+	def __init__(self, length, sources = {}, ref = None, alt = '-') :
 		Sequence_modifiers.__init__(self, sources)
 		self.length = length
+		self.offset = 0
+
+		# alt != '-' means the alt allele keeps a prefix of the reference, e.g. CCTGGAA/C (dbSNP) or CCTGGAA/CCT (samtools)
+		if alt != '-':
+			if ref is not None:
+				if alt == ref[:len(alt)]:
+					self.offset = len(alt)
+					self.length = self.length - len(alt)
+				else: # unsupported notation; NotImplementedError is the exception class (the NotImplemented constant is not callable)
+					raise NotImplementedError("This format of Deletion is not accepted. Please change your format, or implement your format in pyGeno.")
+			else:
+				raise Exception("You need to add a ref sequence in your call of SequenceDel. Or implement your format in pyGeno.")
+
+
 
 class SNPFilter(object) :
 	"""Abstract Class. All filters must inherit from me"""

diff --git a/pyGeno/Transcript.py b/pyGeno/Transcript.py
@@ -22,6 +22,7 @@ class Transcript_Raba(pyGenoRabaObject) :
 	start = rf.Primitive()
 	end = rf.Primitive()
 	coding = rf.Primitive()
+	biotype = rf.Primitive()
 
 	genome = rf.RabaObject('Genome_Raba')
 	chromosome = rf.RabaObject('Chromosome_Raba')
@@ -34,7 +35,15 @@ def _curate(self) :
 			self.name = self.name.upper()
 
 		self.length = abs(self.end - self.start)
-		if self.exons[0].CDS_start is not None and self.exons[-1].CDS_end is not None :
+		have_CDS_start = False
+		have_CDS_end = False
+		for exon in self.exons :
+			if exon.CDS_start is not None :
+				have_CDS_start = True
+			if exon.CDS_end is not None :
+				have_CDS_end = True
+
+		if have_CDS_start and have_CDS_end :
 			self.coding = True
 		else :
 			self.coding = False
@@ -47,7 +56,7 @@ class Transcript(pyGenoRabaObjectWrapper) :
 	def __init__(self, *args, **kwargs) :
 		pyGenoRabaObjectWrapper.__init__(self, *args, **kwargs)
 		self.exons = RLWrapper(self, Exon, self.wrapped_object.exons)
-		self._load_sequencesTriggers = set(["UTR5", "UTR3", "cDNA", "sequence"])
+		self._load_sequencesTriggers = set(["UTR5", "UTR3", "cDNA", "sequence", "data"])
 		self.exonsDict = {}
 
 	def _makeLoadQuery(self, objectType, *args, **coolArgs) :
@@ -70,14 +79,14 @@ def _makeLoadQuery(self, objectType, *args, **coolArgs) :
 
 		return pyGenoRabaObjectWrapper._makeLoadQuery(self, objectType, *args, **coolArgs)
 
-	def _load_sequences(self) :
+	def _load_data(self) :
 		def getV(k) :
 			return pyGenoRabaObjectWrapper.__getattribute__(self, k)
 
 		def setV(k, v) :
 			return pyGenoRabaObjectWrapper.__setattr__(self, k, v)
 
-		sequence = []
+		self.data = []
 		cDNA = []
 		UTR5 = []
 		UTR3 = []
@@ -87,20 +96,20 @@ def setV(k, v) :
 			e = pyGenoRabaObjectWrapper_metaclass._wrappers[Exon_Raba](wrapped_object_and_bag = (ee, getV('bagKey')))
 			self.exonsDict[(e.start, e.end)] = e
 			exons.append(e)
-			sequence.append(e.sequence)
-
+			self.data.extend(e.data)
+			
 			if e.hasCDS() :
-				UTR5.append(e.UTR5)
-				cDNA.append(e.CDS)
-				UTR3.append(e.UTR3)
+				UTR5.append(''.join(e.UTR5))
+				cDNA.append(''.join(e.CDS))
+				UTR3.append(''.join(e.UTR3))
 				prime5 = False
 			else :
 				if prime5 :
-					UTR5.append(e.sequence)
+					UTR5.append(''.join(e.data))
 				else :
-					UTR3.append(e.sequence)
+					UTR3.append(''.join(e.data))
 
-		sequence = ''.join(sequence)
+		sequence = ''.join(self.data)
 		cDNA = ''.join(cDNA)
 		UTR5 = ''.join(UTR5)
 		UTR3 = ''.join(UTR3)

diff --git a/pyGeno/bootstrap_data/__init__.py b/pyGeno/bootstrap_data/__init__.py
diff --git a/pyGeno/configuration.py b/pyGeno/configuration.py
@@ -89,8 +89,8 @@ def pyGeno_init() :
 		os.makedirs(pyGeno_SETTINGS_DIR)
 
 	pyGeno_SETTINGS_PATH = getSettingsPath()
-	pyGeno_RABA_DBFILE = os.path.normpath('%s/pyGenoRaba.db' % pyGeno_SETTINGS_PATH)
-	pyGeno_DATA_PATH = os.path.normpath('%s/data' % pyGeno_SETTINGS_PATH)
+	pyGeno_RABA_DBFILE = os.path.normpath( os.path.join(pyGeno_SETTINGS_PATH, "pyGenoRaba.db") )
+	pyGeno_DATA_PATH = os.path.normpath( os.path.join(pyGeno_SETTINGS_PATH, "data") )
 
 	if not os.path.exists(pyGeno_SETTINGS_PATH) :
 		os.makedirs(pyGeno_SETTINGS_PATH)

diff --git a/pyGeno/doc/source/index.rst b/pyGeno/doc/source/index.rst
@@ -11,10 +11,9 @@ pyGeno: A Python package for precision medicine and proteogenomics
 .. image:: http://depsy.org/api/package/pypi/pyGeno/badge.svg
    :alt: depsy
    :target: http://depsy.org/package/python/pyGeno
-
-.. image:: https://img.shields.io/pypi/dm/pyGeno.svg
-   :alt: downloads
-   :target: https://pypi.python.org/pypi/pyGeno
+.. image:: https://img.shields.io/badge/License-Apache%202.0-blue.svg
+    :target: https://opensource.org/licenses/Apache-2.0
+.. image:: https://img.shields.io/badge/python-2.7-blue.svg 
 
 pyGeno's `lair is on Github`_.
 
@@ -114,8 +113,8 @@ Contents:
    installation
    bootstraping
    querying
-   datawraps
    importation
+   datawraps
    objects
    snp_filter
    tools