Bump version

rrwick · Jun 6, 2023 · 92c3943 · 92c3943
1 parent 24c8785
commit 92c3943
Show file tree

Hide file tree

Showing 8 changed files with 190 additions and 87 deletions.
diff --git a/README.md b/README.md
@@ -12,24 +12,24 @@ Badread is published in the [Journal of Open Source Software](http://joss.theoj.
 
 ## Table of contents
 
-  * [Requirements](#requirements)
-  * [Installation](#installation)
-  * [Quick usage](#quick-usage)
-  * [Method](#method)
-  * [Detailed usage](#detailed-usage)
-     * [Command line](#command-line)
-     * [Reference FASTA](#reference-fasta)
-     * [Fragment lengths](#fragment-lengths)
-     * [Read identities](#read-identities)
-     * [Error model](#error-model)
-     * [QScore model](#qscore-model)
-     * [Adapters](#adapters)
-     * [Junk and random reads](#junk-and-random-reads)
-     * [Chimeras](#chimeras)
-     * [Small plasmid bias](#small-plasmid-bias)
-     * [Glitches](#glitches)
-  * [Contributing](#contributing)
-  * [License](#license)
+* [Requirements](#requirements)
+* [Installation](#installation)
+* [Quick usage](#quick-usage)
+* [Method](#method)
+* [Detailed usage](#detailed-usage)
+  * [Command line](#command-line)
+  * [Reference FASTA](#reference-fasta)
+  * [Fragment lengths](#fragment-lengths)
+  * [Read identities](#read-identities)
+  * [Error model](#error-model)
+  * [QScore model](#qscore-model)
+  * [Adapters](#adapters)
+  * [Junk and random reads](#junk-and-random-reads)
+  * [Chimeras](#chimeras)
+  * [Small plasmid bias](#small-plasmid-bias)
+  * [Glitches](#glitches)
+* [Contributing](#contributing)
+* [License](#license)
 
 
 
@@ -90,29 +90,29 @@ badread simulate --reference ref.fasta --quantity 50x \
 To simulate older Oxford Nanopore reads (R9.4.1, worse basecalling):
 ```bash
 badread simulate --reference ref.fasta --quantity 50x \
---error_model pacbio2016 --qscore_model nanopore2020 --identity 90,98,5 \
+    --error_model nanopore2020 --qscore_model nanopore2020 --identity 90,98,5 \
     | gzip > reads.fastq.gz
 ```
 
-Alternatively, you can use Badread's built-in models to imitate older PacBio reads. This command also adjusts the identity and length distributions to be a bit more PacBio-2016-like:
+Very bad reads:
 ```bash
-badread simulate --reference ref.fasta --quantity 50x \
-    --error_model pacbio2016 --qscore_model pacbio2016 --identity 85,95,3 --length 7500,7500 \
+badread simulate --reference ref.fasta --quantity 50x --glitches 1000,100,100 \
+    --junk_reads 5 --random_reads 5 --chimeras 10 --identity 80,90,6 --length 4000,2000 \
     | gzip > reads.fastq.gz
 ```
 
-Very bad reads:
+Pretty good reads:
 ```bash
-badread simulate --reference ref.fasta --quantity 50x --glitches 1000,100,100 \
-    --junk_reads 5 --random_reads 5 --chimeras 10 --identity 75,90,8 \
+badread simulate --reference ref.fasta --quantity 50x --glitches 10000,10,10 \
+    --junk_reads 0.1 --random_reads 0.1 --chimeras 0.1 --identity 20,3 \
     | gzip > reads.fastq.gz
 ```
 
-Very nice reads:
+Very good reads:
 ```bash
 badread simulate --reference ref.fasta --quantity 50x --error_model random \
-    --qscore_model ideal --glitches 0,0,0 --junk_reads 0 --random_reads 0 \
-    --chimeras 0 --identity 99,100,2 --start_adapter_seq "" --end_adapter_seq "" \
+    --qscore_model ideal --glitches 0,0,0 --junk_reads 0 --random_reads 0 --chimeras 0 \
+    --identity 30,3 --length 40000,20000 --start_adapter_seq "" --end_adapter_seq "" \
     | gzip > reads.fastq.gz
 ```
 
@@ -225,37 +225,28 @@ For a couple of examples, check out [the reference FASTA page on the wiki](https
 
 ### Fragment lengths
 
-Badread generates fragment lengths from a [gamma distribution](https://en.wikipedia.org/wiki/Gamma_distribution). While a gamma distribution is usually parameterised with shape and scale (_k_ and _θ_) or shape and rate (_α_ and _β_), I don't find these particularly intuitive. So Badread instead defines the fragment lengths using mean and standard deviation.
+Badread generates fragment lengths from a [gamma distribution](https://en.wikipedia.org/wiki/Gamma_distribution). While a gamma distribution is usually parameterised with shape and scale (_k_ and _θ_) or shape and rate (_α_ and _β_), I don't find these particularly intuitive. So Badread instead defines fragment lengths using mean and standard deviation.
 
-There are two ways to think about fragment lengths: the distribution of the fragment lengths and the distribution of _bases_ in the fragments. The latter distribution is higher because larger fragments contribute more bases. The read N50 is the median of the base (red) distribution – half the bases will be in reads shorter than this and half in longer reads.
+There are two ways to think about fragment lengths: the distribution of the fragment lengths and the distribution of _bases_ in the fragments. The latter distribution is higher because larger fragments contribute more bases. The read N50 is the median of the base distribution – half the bases will be in reads shorter than this and half in longer reads.
 
-<table>
-    <tr>
-        <td>
-            <img align="right" src="images/default_frag_lengths.png" alt="Default length distribution" width="400">
-            Badread's default is <code>--length 15000,13000</code> (mean=15000, stdev=13000) which corresponds to a decent Nanopore run (N50=22.6 kbp). The fragment length distribution is in blue, while the base distribution is in red.<br><br>
-            To see the equations and interactively explore how different parameters affect the distributions, check out <a href="https://www.desmos.com/calculator/xrkqgzt4o5">this Desmos plot</a>.
-        </td>
-    </tr>
-</table>
+Badread's default is `--length 15000,13000` (mean=15000, stdev=13000) which corresponds to a decent Nanopore run (N50=22.6 kbp). To see the equations and interactively explore how different parameters affect the distributions, check out [this Desmos plot](https://www.desmos.com/calculator/z2a3yqssie).
 
 Note that these parameters control the length of the _fragments_, not the final _reads_. These differ because: adapters are added to fragments, glitches can lengthen/shorten fragments, adding read errors can change the length (especially if the error model is biased towards insertions or deletions) and chimeras are made by concatenating multiple fragments together.
 
 
 
 ### Read identities
 
-Badread generates read identities from a [beta distribution](https://en.wikipedia.org/wiki/Beta_distribution). Like with fragment lengths, Badread defines the distribution with mean and standard deviation instead of using the more formal (and less intuitive) shape parameters (_α_ and _β_). In addition, Badread scales the distribution down using a maximum value, for a total of three parameters: `mean,max,stdev`.
-
-<table>
-    <tr>
-        <td>
-            <img align="right" src="images/default_identities.png" alt="Default identity distribution" width="400">
-            Badread's default is <code>--identity 87.5,97.5,5</code> which corresponds to an okay (but not great) Nanopore sequencing run.<br><br>
-            To see the equations and interactively explore how different parameters affect the distribution, check out <a href="https://www.desmos.com/calculator/q7qw6rq2lb">this Desmos plot</a>.
-        </td>
-    </tr>
-</table>
+Badread can generate read identities in two alternative ways.
+
+The first way uses a [beta distribution](https://en.wikipedia.org/wiki/Beta_distribution) to sample read identities. Like with fragment lengths, Badread defines the distribution with mean and standard deviation instead of using the more formal (and less intuitive) shape parameters (_α_ and _β_). In addition, Badread scales the distribution down using a maximum value, for a total of three parameters. To use this method, give three comma-delimited values (identity mean, max, stdev), e.g. `--identity 95,99,2.5`.
+
+The second way uses a [normal distribution](https://en.wikipedia.org/wiki/Normal_distribution) to sample read [qscores](https://en.wikipedia.org/wiki/Phred_quality_score). To use this method, give two comma-delimited values (qscore mean and stdev), e.g. `--identity 20,4`. This approach is better suited to high accuracy reads like ONT duplex or PacBio HiFi.
+
+Badread's default is `--identity 95,99,2.5` which corresponds to an okay (but not great) R10.4.1 Nanopore sequencing run. To see the equations and interactively explore how different parameters affect the distribution, check out these Desmos plots:
+* [three-parameter identity beta distribution](https://www.desmos.com/calculator/u9fivqmisa)
+* [two-parameter qscore normal distribution (qscore on x-axis)](https://www.desmos.com/calculator/kpganphxj0)
+* [two-parameter qscore normal distribution (identity on x-axis)](https://www.desmos.com/calculator/kg9s0dvbup)
 
 For detail on how Badread defines identity, check out [this page on the wiki](https://github.com/rrwick/Badread/wiki/Definition-of-identity).
 

diff --git a/badread/__main__.py b/badread/__main__.py
@@ -97,7 +97,8 @@ def simulate_subparser(subparsers):
                           help='Fragment length distribution (mean and stdev, '
                                'default: DEFAULT)')
     sim_args.add_argument('--identity', type=str, default='95,99,2.5',
-                          help='Sequencing identity distribution (mean, max and stdev, '
+                          help='Sequencing identity distribution (mean, max and stdev for beta '
+                               'distribution, or mean and stdev for normal qscore distribution , '
                                'default: DEFAULT)')
     sim_args.add_argument('--error_model', type=str, default='nanopore2023',
                           help='Can be "nanopore2018", "nanopore2020", "nanopore2023", '
@@ -274,24 +275,20 @@ def check_simulate_args(args):
 
     try:
         identity_parameters = [float(x) for x in args.identity.split(',')]
-        args.mean_identity = identity_parameters[0]
-        args.max_identity = identity_parameters[1]
-        args.identity_stdev = identity_parameters[2]
+        if len(identity_parameters) == 2:
+            args.mean_identity = identity_parameters[0]
+            args.max_identity = None
+            args.identity_stdev = identity_parameters[1]
+            check_qscore_identities(args)
+        elif len(identity_parameters) == 3:
+            args.mean_identity = identity_parameters[0]
+            args.max_identity = identity_parameters[1]
+            args.identity_stdev = identity_parameters[2]
+            check_beta_identities(args)
+        else:
+            sys.exit('Error: could not parse --identity values')
     except (ValueError, IndexError):
         sys.exit('Error: could not parse --identity values')
-    if args.mean_identity > 100.0:
-        sys.exit('Error: mean read identity cannot be more than 100')
-    if args.max_identity > 100.0:
-        sys.exit('Error: max read identity cannot be more than 100')
-    if args.mean_identity <= settings.MIN_MEAN_READ_IDENTITY:
-        sys.exit(f'Error: mean read identity must be at least {settings.MIN_MEAN_READ_IDENTITY}')
-    if args.max_identity <= settings.MIN_MEAN_READ_IDENTITY:
-        sys.exit(f'Error: max read identity must be at least {settings.MIN_MEAN_READ_IDENTITY}')
-    if args.mean_identity > args.max_identity:
-        sys.exit(f'Error: mean identity ({args.mean_identity}) cannot be larger than max '
-                 f'identity ({args.max_identity})')
-    if args.identity_stdev < 0.0:
-        sys.exit('Error: read identity stdev cannot be negative')
 
     try:
         glitch_parameters = [float(x) for x in args.glitches.split(',')]
@@ -315,6 +312,29 @@ def check_simulate_args(args):
                 sys.exit('Error: --end_adapter_seq must be a DNA sequence or a number')
 
 
+def check_beta_identities(args):
+    if args.mean_identity > 100.0:
+        sys.exit('Error: mean read identity cannot be more than 100')
+    if args.max_identity > 100.0:
+        sys.exit('Error: max read identity cannot be more than 100')
+    if args.mean_identity <= settings.MIN_MEAN_READ_IDENTITY:
+        sys.exit(f'Error: mean read identity must be at least {settings.MIN_MEAN_READ_IDENTITY}')
+    if args.max_identity <= settings.MIN_MEAN_READ_IDENTITY:
+        sys.exit(f'Error: max read identity must be at least {settings.MIN_MEAN_READ_IDENTITY}')
+    if args.mean_identity > args.max_identity:
+        sys.exit(f'Error: mean identity ({args.mean_identity}) cannot be larger than max '
+                 f'identity ({args.max_identity})')
+    if args.identity_stdev < 0.0:
+        sys.exit('Error: read identity stdev cannot be negative')
+
+
+def check_qscore_identities(args):
+    if args.mean_identity <= settings.MIN_MEAN_READ_QSCORE:
+        sys.exit(f'Error: mean read identity must be at least {settings.MIN_MEAN_READ_QSCORE}')
+    if args.identity_stdev < 0.0:
+        sys.exit('Error: read qscore stdev cannot be negative')
+
+
 def check_python_version():
     if sys.version_info.major < 3 or sys.version_info.minor < 6:
         sys.exit('Error: Badread requires Python 3.6 or later')

diff --git a/badread/identities.py b/badread/identities.py
@@ -1,6 +1,5 @@
 """
-This module contains a class for describing read identity distributions (described by the beta
-distribution) and related functions.
+This module contains a class for describing read identity distributions and related functions.
 
 Copyright 2018 Ryan Wick ([email protected])
 https://github.com/rrwick/Badread
@@ -23,19 +22,34 @@
 class Identities(object):
 
     def __init__(self, mean, stdev, max_identity, output=sys.stderr):
+        self.mean, self.stdev, self.max_identity = None, None, None
+        self.beta_a, self.beta_b = None, None
+
+        print('', file=output)
+
+        # There are two possible types of identity distributions: a three-parameter beta
+        # distribution that describes read identities, or a two-parameter normal distribution that
+        # describes read qscores.
+        if max_identity is None:
+            self.type = "normal"
+            self.set_up_normal(mean, stdev, output)
+        else:
+            self.type = "beta"
+            self.set_up_beta(mean, stdev, max_identity, output)
+
+    def set_up_beta(self, mean, stdev, max_identity, output):
         # Divide by 100 to convert from percentage to fraction
         self.mean = mean / 100.0
         self.stdev = stdev / 100.0
         self.max_identity = max_identity / 100.0
-        print('', file=output)
 
         if self.mean == self.max_identity:
             self.beta_a, self.beta_b = None, None
             print(f'Using a constant read identity of {self.mean * 100}%', file=output)
         elif self.stdev == 0.0:
             self.max_identity = self.mean
             print(f'Using a constant read identity of {self.mean * 100}%', file=output)
-        else:  # beta distribution
+        else:
             print('Generating read identities from a beta distribution:', file=output)
             self.beta_a, self.beta_b = beta_parameters(mean, stdev, max_identity)
             print_in_two_columns(f'  mean  = {float_to_str(self.mean * 100):>3}%',
@@ -47,12 +61,37 @@ def __init__(self, mean, stdev, max_identity, output=sys.stderr):
                                  output=output)
             quickhist_beta(self.beta_a, self.beta_b, self.max_identity, 8, output=output)
 
+    def set_up_normal(self, mean, stdev, output):
+        self.mean = mean
+        self.stdev = stdev
+
+        if self.stdev == 0.0:
+            self.max_identity = self.mean
+            print(f'Using a constant read qscore of {self.mean}', file=output)
+        else:
+            print('Generating read qscores from a normal distribution:', file=output)
+            print(f'  mean  = {float_to_str(self.mean):>3}', file=output)
+            print(f'  stdev = {float_to_str(self.stdev):>3}', file=output)
+
     def get_identity(self):
+        while True:
+            if self.type == "beta":
+                identity = self.get_beta_identity()
+            else:
+                identity = self.get_normal_identity()
+            if 0 <= identity <= 100:
+                return identity
+
+    def get_beta_identity(self):
         if self.mean == self.max_identity:
             return self.mean
         else:  # beta distribution
             return self.max_identity * np.random.beta(self.beta_a, self.beta_b)
 
+    def get_normal_identity(self):
+        qscore = np.random.normal(self.mean, self.stdev)
+        return 1.0 - 10**(-qscore / 10)
+
 
 def beta_parameters(beta_mean, beta_stdev, beta_max):
     u, s, m = beta_mean, beta_stdev, beta_max

diff --git a/badread/settings.py b/badread/settings.py
@@ -29,6 +29,7 @@
 # identity (e.g. 50%) as that might break some things. These settings control how low they can go.
 MIN_MEAN_READ_LENGTH = 100
 MIN_MEAN_READ_IDENTITY = 50
+MIN_MEAN_READ_QSCORE = 5
 
 
 # If a random qscore model is used, this controls the range of the qualities.

diff --git a/badread/simulate.py b/badread/simulate.py
@@ -72,7 +72,7 @@ def simulate(args, output=sys.stderr):
 
         info.append(f'length={len(seq)}')
         info.append(f'error-free_length={len(fragment)}')
-        info.append(f'read_identity={actual_identity * 100.0:.2f}%')
+        info.append(f'read_identity={actual_identity * 100.0:.3f}%')
 
         read_name = uuid.UUID(int=random.getrandbits(128))
         info = ' '.join(info)

diff --git a/badread/version.py b/badread/version.py
@@ -14,4 +14,4 @@
 If not, see <http://www.gnu.org/licenses/>.
 """
 
-__version__ = '0.3.0'
+__version__ = '0.4.0'