Skip to content

Commit

Permalink
Restore functionality for [Sequence] filters: Python 3 upgrades (#244)
Browse files Browse the repository at this point in the history
* port `[AnthonyNolanFilter]` to Python 3

* update `StringMatrix` class to store alleles as strings if passed as such;
  fall back to the original `ndarray` behaviour if the value is not a string.

* add a unit test to check types set as above

* move MSF files into the unit tests data area and restore the associated .pop file and .ini file for the `sequence-nopoptests.ini`

* update the behaviour of the `directory` key in `[Sequence]` and `[AnthonyNolan]`, and update the documentation accordingly
  • Loading branch information
alexlancaster authored Jan 18, 2025
1 parent 90843cc commit bd3277e
Show file tree
Hide file tree
Showing 83 changed files with 6,317 additions and 63 deletions.
62 changes: 28 additions & 34 deletions src/PyPop/Filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@
."""

import re
import string
import sys
from abc import ABC, abstractmethod
from functools import reduce
Expand Down Expand Up @@ -227,7 +226,7 @@ def __init__(
# name = "C"
allele = matchobj.group(2)

if self.alleleLookupTable.has_key(name):
if name in self.alleleLookupTable:
if allele not in self.alleleLookupTable[name]:
self.alleleLookupTable[name].append(allele)
else:
Expand Down Expand Up @@ -379,7 +378,7 @@ def addAllele(self, alleleName):

filteredAllele = self.translTable[alleleName]

if self.countTable.has_key(filteredAllele):
if filteredAllele in self.countTable:
self.countTable[filteredAllele] += 1
else:
self.countTable[filteredAllele] = 1
Expand All @@ -389,7 +388,7 @@ def endFirstPass(self):
print("translation table:", self.translTable)
print("count table:", self.countTable)

translKeys = self.translTable.keys()
translKeys = list(self.translTable.keys())

for allele in translKeys:
# check to see if we have an allele of the form
Expand Down Expand Up @@ -478,7 +477,7 @@ def filterAllele(self, alleleName):
if subname != transl:
self.logFile.emptytag("translate", input=subname, output=transl)
self.logFile.writeln()
transl = string.join(transl_collection, "/")
transl = "/".join(transl_collection)
if alleleName != transl:
self.logFile.emptytag("translate", input=alleleName, output=transl)
self.logFile.writeln()
Expand Down Expand Up @@ -535,23 +534,22 @@ def makeSeqDictionaries(self, matrix=None, locus=None):
if match:
break
try:
self.length = int(string.split(match.group())[1])
self.length = int(match.group().split()[1])
except Exception:
# FIXME: How do we want to handle a non-existent MSF header alignment length
msg = f"could not find the alignment length from msf file {self.filename}."
raise RuntimeError(msg) from None

# see where the header of the MSF file ends (demarcated by // )
self.msfHead = 0
for line in self.lines:
if string.find(line, "//") != -1:
break
self.msfHead += 1
for index, line in enumerate(self.lines):
if "//" in line:
self.msfHead = index

for individ in self.matrix[the_locus]:
for allele in individ:
# if the allele hasn't been keyed yet, we'll have to get a sequence
if not self.sequences.has_key(allele):
if allele not in self.sequences:
# FIXME: this code is specific to HLA data
# find "null alleles" (ending in "N")
# it makes a null allele
Expand Down Expand Up @@ -612,15 +610,13 @@ def makeSeqDictionaries(self, matrix=None, locus=None):
if self.debug:
print("full sequence for locus", the_locus, self.sequences)

# Make the self.unsequenedSite (normally '#') the standard null placeholder
# Make the self.unsequencedSite (normally '#') the standard null placeholder
for allele in self.sequences:
##self.sequences[allele] = string.replace(self.sequences[allele],'.','*')
self.sequences[allele] = string.replace(
self.sequences[allele], ".", self.unsequencedSite
self.sequences[allele] = self.sequences[allele].replace(
".", self.unsequencedSite
)
##self.sequences[allele] = string.replace(self.sequences[allele],'X','*')
self.sequences[allele] = string.replace(
self.sequences[allele], "X", self.unsequencedSite
self.sequences[allele] = self.sequences[allele].replace(
"X", self.unsequencedSite
)

# pre-populates the polyseq dictionary with empty strings,
Expand Down Expand Up @@ -721,10 +717,10 @@ def translateMatrix(self, matrix=None):
]
else:
positionString[positionDigit] += " "
for line in range(len(positionString.keys()), 0, -1):
for line in range(len(list(positionString.keys())), 0, -1):
self.logFile.writeln("\t\t" + positionString[line])

li = alleleTally.keys()
li = list(alleleTally.keys())
li.sort()
for allele in li:
self.logFile.writeln(
Expand All @@ -751,7 +747,7 @@ def translateMatrix(self, matrix=None):
positionReportString = "Position: " + str(
self.polyseqpos[locus][position]
)
letters = positionTally.keys()
letters = list(positionTally.keys())
letters.sort()
for letter in letters:
positionReportString += (
Expand Down Expand Up @@ -821,7 +817,7 @@ def _genOffsets(self, locus, pos):
"DPB1": 29,
}

return str(pos - offsets[locus]) if offsets.has_key(locus) else str(pos)
return str(pos - offsets[locus]) if locus in offsets else str(pos)

def _getMSFLinesForLocus(self, locus):
# FIXME: this code is specific to hla data
Expand All @@ -846,9 +842,10 @@ def _getSequenceFromLines(self, locus=None, allele=None):
regexp = re.compile(".*" + re.escape(name) + " .*")
seq = ""
for line in self.lines[self.msfHead :]:
if string.find(line, name + " ") != -1:
if line.find(name + " ") != -1:
match = re.search(regexp, line)
seq = seq + string.join(string.split(match.group())[1:], "")
seq += "".join(match.group().split()[1:])
# seq = seq + string.join(string.split(match.group())[1:], "")

# check length of seq against what we expected from the msf header
if len(seq) < self.length:
Expand Down Expand Up @@ -914,10 +911,10 @@ def _getConsensusFromLines(self, locus=None, allele=None):
print(
f"{allele} NOT found in the msf file, {closestMatches.keys()[0]} is only close match, so using that."
)
seq = closestMatches.values()[0]
seq = next(iter(closestMatches.values()))

else:
for pos in range(len(closestMatches.values()[0])):
for pos in range(len(next(iter(closestMatches.values())))):
# checks each position of each allele, counts the number
# of unique characters (excepting . X and * characters)
uniqueCounter = {}
Expand Down Expand Up @@ -958,7 +955,7 @@ def _getConsensusFromLines(self, locus=None, allele=None):
print(seq)
print(
f"{allele} NOT found in the msf file, so we use a consensus of ",
closestMatches.keys(),
list(closestMatches.keys()),
)

return seq
Expand Down Expand Up @@ -992,7 +989,7 @@ def doDigitBinning(self, matrix=None):
individCount = 0
for individCount, individ in enumerate(matrix[locus]):
for i in range(2):
allele[i] = str(individ[i]) # FIXME: matrix type is `ndarray`
allele[i] = individ[i]
if (
allele[i] != self.untypedAllele
and len(allele[i]) > self.binningDigits
Expand Down Expand Up @@ -1021,9 +1018,6 @@ def doCustomBinning(self, matrix=None):
if locus.lower() in self.customBinningDict:
for individ in matrix[locus]:
for i in range(2):
individ[i] = str(
individ[i]
) # FIXME: matrix type is `ndarray` needs to be string for len() and other string operations
if len(individ[i].split("/")) > 1:
allele_collection = []
for subname in individ[i].split("/"):
Expand All @@ -1033,7 +1027,7 @@ def doCustomBinning(self, matrix=None):
)
]

allele[i] = string.join(list(set(allele_collection)), "/")
allele[i] = "/".join(list(set(allele_collection)))

else:
allele[i] = self.lookupCustomBinning(
Expand Down Expand Up @@ -1102,7 +1096,7 @@ def lookupCustomBinning(self, testAllele, locus):

if len(closeMatches) > 0:
bestScore = 1000
for match, score in closeMatches.items():
for match, score in list(closeMatches.items()):
if score < bestScore:
bestScore = score
finalMatch = match
Expand Down Expand Up @@ -1145,7 +1139,7 @@ def endFirstPass(self):

# now, translate alleles with count < lumpThreshold to "lump"

translKeys = self.translTable.keys()
translKeys = list(self.translTable.keys())

for allele in translKeys:
filteredAllele = self.translTable[allele]
Expand Down
54 changes: 42 additions & 12 deletions src/PyPop/Main.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@

"""Python population genetics statistics."""

import os
import sys
import time
from configparser import ConfigParser, NoOptionError, NoSectionError
Expand Down Expand Up @@ -83,6 +84,35 @@ def getConfigInstance(configFilename=None, altpath=None):
return config


def get_sequence_directory(directory_str, debug=False):
    """Return the directory holding the Anthony Nolan sequence files, as a string.

    An absolute ``directory_str`` is used as given.  A relative one is
    handled in this order:

    1. if it names an existing directory relative to the current working
       directory, it is resolved to an absolute path;
    2. otherwise, if the ``PYPOP_CURRENT_TEST_DIRECTORY`` environment
       variable is set (non-empty), it is joined onto the *parent* of that
       directory;
    3. otherwise the program exits.

    Whatever candidate results is then required to be an existing
    directory.

    Args:
        directory_str: path to the sequence-file directory.
        debug: when true, print the directory that was selected.

    Returns:
        The selected directory as a ``str``.

    Raises:
        SystemExit: if no existing directory can be determined.
    """
    candidate = Path(directory_str)

    if not candidate.is_absolute():
        # set when running under the test suite; read once up front
        test_dir = os.environ.get("PYPOP_CURRENT_TEST_DIRECTORY")

        if candidate.is_dir():
            # relative to the current working directory
            candidate = candidate.resolve()
        elif test_dir:
            # test environment: interpret the path relative to the parent
            # of the "tests" directory
            candidate = Path(test_dir).parent / candidate
        else:
            sys.exit(
                f"Relative path {candidate} for AnthonyNolan sequence files does not exist or is not a directory."
            )

    # whichever branch produced the candidate, it must exist as a directory
    if not candidate.is_dir():
        sys.exit(
            f"Absolute path {candidate} for Anthony Nolan sequence files does not exist or is not a directory"
        )

    sequence_dir = str(candidate)
    if debug:
        print(f"Using {sequence_dir} for AnthonyNolan data files")
    return sequence_dir


class Main:
"""Main interface to the PyPop modules.
Expand Down Expand Up @@ -524,13 +554,13 @@ def _runFilters(self):

if filterType == "AnthonyNolan":
try:
anthonynolanPath = self.config.get(filterCall, "directory")
anthonynolanPath = get_sequence_directory(
self.config.get(filterCall, "directory"), debug=self.debug
)
except Exception:
anthonynolanPath = Path(self.datapath) / "anthonynolan" / "msf"
if self.debug:
print(
f"LOG: Defaulting to system datapath {anthonynolanPath} for anthonynolanPath data"
)
sys.exit(
"Need to provide a path to the Anthony Nolan sequence files: no default"
)
try:
alleleFileFormat = self.config.get(filterCall, "alleleFileFormat")
except Exception:
Expand Down Expand Up @@ -624,13 +654,13 @@ def _runFilters(self):
except Exception:
sequenceFileSuffix = "_prot"
try:
anthonynolanPath = self.config.get(filterCall, "directory")
anthonynolanPath = get_sequence_directory(
self.config.get(filterCall, "directory"), debug=self.debug
)
except Exception:
anthonynolanPath = Path(self.datapath) / "anthonynolan" / "msf"
if self.debug:
print(
f"LOG: Defaulting to system datapath {anthonynolanPath} for anthonynolanPath data"
)
sys.exit(
"Need to provide a path to the Anthony Nolan sequence files: no default"
)
try:
sequenceFilterMethod = self.config.get(
filterCall, "sequenceConsensusMethod"
Expand Down
8 changes: 4 additions & 4 deletions src/PyPop/Utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -589,11 +589,11 @@ def __setitem__(self, index, value):
col1 = col * 2
col2 = col1 + 1
# store each element in turn
self.array[(row, col1 + self.extraCount)] = asarray(
value1, dtype=self.dtype
self.array[(row, col1 + self.extraCount)] = (
value1 if type(value1) is str else asarray(value1, dtype=self.dtype)
)
self.array[(row, col2 + self.extraCount)] = asarray(
value2, dtype=self.dtype
self.array[(row, col2 + self.extraCount)] = (
value2 if type(value2) is str else asarray(value2, dtype=self.dtype)
)

elif colName in self.extraList:
Expand Down
8 changes: 8 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,16 @@
# adapted from pytest documentation
# https://docs.pytest.org/en/latest/example/simple.html#control-skipping-of-tests-according-to-command-line-option

import os
from pathlib import Path

import pytest

# FIXME: a bit hacky
# Export the directory that contains this conftest through the
# PYPOP_CURRENT_TEST_DIRECTORY environment variable, so that code run under
# the test suite can look the location up via os.environ.
current_dir = Path(__file__).parent  # directory of the current test script
os.environ["PYPOP_CURRENT_TEST_DIRECTORY"] = os.fspath(current_dir)


def pytest_addoption(parser):
parser.addoption(
Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
Loading

0 comments on commit bd3277e

Please sign in to comment.