From 4d380fab032fbeaacec88ad1d21547a2ca44cbe6 Mon Sep 17 00:00:00 2001 From: mdehoon Date: Thu, 6 Feb 2025 21:44:09 +0900 Subject: [PATCH] use ignore_sequences instead of gaps_only (#4929) * update * update * update --------- Co-authored-by: Michiel Jan Laurens de Hoon --- Bio/Align/__init__.py | 23 +++++++++++++---------- Doc/Tutorial/chapter_align.rst | 27 ++++++++++++++++++++++++--- Tests/test_Align_Alignment.py | 6 ++++-- 3 files changed, 41 insertions(+), 15 deletions(-) diff --git a/Bio/Align/__init__.py b/Bio/Align/__init__.py index f7356ece6ac..c7fe0b7d530 100644 --- a/Bio/Align/__init__.py +++ b/Bio/Align/__init__.py @@ -3723,7 +3723,7 @@ def substitutions(self): start1, start2 = end1, end2 return m - def counts(self, substitution_matrix=None, gaps_only=False): + def counts(self, substitution_matrix=None, ignore_sequences=False): """Count the number of identities, mismatches, and gaps of an alignment. Arguments: @@ -3733,12 +3733,13 @@ def counts(self, substitution_matrix=None, gaps_only=False): (typically from the ``Bio.Align.substitution_matrices`` submodule) to also calculate the number of positive matches in an amino acid alignment. - - gaps_only - If True, do not calculate the number of identities, + - ignore_sequences - If True, do not calculate the number of identities, positives, and mismatches, but only calculate the - number of gaps. This will speed up the calculation. + number of aligned letters and number of gaps + to speed up the calculation. Default value: False. - A ValueError is raised if gaps_only is True and substitution_matrix is not None. + A ValueError is raised if ignore_sequences is True and substitution_matrix is not None. 
>>> aligner = PairwiseAligner(mode='global', match_score=2, mismatch_score=-1) >>> for alignment in aligner.align("TACCG", "ACG"): @@ -3793,7 +3794,7 @@ def counts(self, substitution_matrix=None, gaps_only=False): right_insertions = right_deletions = 0 internal_insertions = internal_deletions = 0 aligned = 0 - if gaps_only: + if ignore_sequences: identities = None mismatches = None else: @@ -3801,8 +3802,10 @@ def counts(self, substitution_matrix=None, gaps_only=False): mismatches = 0 if substitution_matrix is None: positives = None - elif gaps_only: - raise ValueError("gaps_only cannot be True if substitution_matrix is used") + elif ignore_sequences: + raise ValueError( + "ignore_sequences cannot be True if substitution_matrix is used" + ) else: positives = 0 sequences = [None] * len(self.sequences) @@ -3813,7 +3816,7 @@ def counts(self, substitution_matrix=None, gaps_only=False): for i, sequence in enumerate(self.sequences): start = min(coordinates[i, :]) end = max(coordinates[i, :]) - if not gaps_only: + if not ignore_sequences: try: sequence = sequence[start:end] except ValueError: @@ -3823,10 +3826,10 @@ def counts(self, substitution_matrix=None, gaps_only=False): if sum(aligned_steps > 0) > sum(aligned_steps < 0): coordinates[i, :] = coordinates[i, :] - start else: - if not gaps_only: + if not ignore_sequences: sequence = reverse_complement(sequence) coordinates[i, :] = end - coordinates[i, :] - if gaps_only: + if ignore_sequences: sequences[i] = None else: try: diff --git a/Doc/Tutorial/chapter_align.rst b/Doc/Tutorial/chapter_align.rst index 3b79921e6f4..869380a36d8 100644 --- a/Doc/Tutorial/chapter_align.rst +++ b/Doc/Tutorial/chapter_align.rst @@ -581,9 +581,10 @@ alignment are indicated by -1: Counting identities, mismatches, and gaps ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The ``counts`` method counts the number of identities, mismatches, and gaps -(insertions and deletions) of an alignment. 
The return value is an -``AlignmentCounts`` object, from which the counts can be obtained as properties. +The ``counts`` method counts the number of identities, mismatches, aligned +letters, and gaps (insertions and deletions) of an alignment. The return +value is an ``AlignmentCounts`` object, from which the counts can be obtained +as properties. .. cont-doctest @@ -656,6 +657,26 @@ number of gaps (= insertions + deletions): >>> counts.internal_gaps 2 +To speed up the calculation, you can use ``ignore_sequences=True`` to skip +counting the number of matches and mismatches (this will still calculate the +number of aligned letters): + +.. cont-doctest + +.. code:: pycon + + >>> counts = alignment.counts(ignore_sequences=True) + >>> counts.aligned + 16 + >>> print(counts.identities) + None + >>> print(counts.mismatches) + None + >>> counts.insertions + 1 + >>> counts.deletions + 5 + For protein alignments, in addition to the number of identities and mismatches, you can also count the number of positive matches by supplying a substitution matrix (see Chapter :ref:`sec:substitution_matrices`): diff --git a/Tests/test_Align_Alignment.py b/Tests/test_Align_Alignment.py index 761ed7594af..d99a336360d 100644 --- a/Tests/test_Align_Alignment.py +++ b/Tests/test_Align_Alignment.py @@ -2376,13 +2376,15 @@ def test_counts(self): str(counts), "AlignmentCounts(left_insertions=0, left_deletions=0, internal_insertions=0, internal_deletions=0, right_insertions=80, right_deletions=4, aligned=3084, identities=3020, mismatches=64, positives=None)", ) - counts = alignment.counts(gaps_only=True) + counts = alignment.counts(ignore_sequences=True) self.assertEqual( str(counts), "AlignmentCounts(left_insertions=0, left_deletions=0, internal_insertions=0, internal_deletions=0, right_insertions=80, right_deletions=4, aligned=3084, identities=None, mismatches=None, positives=None)", ) with self.assertRaises(ValueError): - alignment.counts(substitution_matrix=substitution_matrix, 
gaps_only=True) + alignment.counts( + substitution_matrix=substitution_matrix, ignore_sequences=True + ) for i, sequence in enumerate(alignment.sequences): length = len(sequence) alignment.sequences[i] = Seq(None, length)