From 92c39431983cbf63e0748bd5aced5ef38c902158 Mon Sep 17 00:00:00 2001 From: Ryan Wick Date: Tue, 6 Jun 2023 16:49:32 +1000 Subject: [PATCH] Bump version --- README.md | 91 +++++++++++++++++++---------------------- badread/__main__.py | 54 ++++++++++++++++-------- badread/identities.py | 47 +++++++++++++++++++-- badread/settings.py | 1 + badread/simulate.py | 2 +- badread/version.py | 2 +- test/test_cli.py | 50 +++++++++++++++------- test/test_identities.py | 30 ++++++++++++++ 8 files changed, 190 insertions(+), 87 deletions(-) diff --git a/README.md b/README.md index 57f4bc1..ad744e8 100644 --- a/README.md +++ b/README.md @@ -12,24 +12,24 @@ Badread is published in the [Journal of Open Source Software](http://joss.theoj. ## Table of contents - * [Requirements](#requirements) - * [Installation](#installation) - * [Quick usage](#quick-usage) - * [Method](#method) - * [Detailed usage](#detailed-usage) - * [Command line](#command-line) - * [Reference FASTA](#reference-fasta) - * [Fragment lengths](#fragment-lengths) - * [Read identities](#read-identities) - * [Error model](#error-model) - * [QScore model](#qscore-model) - * [Adapters](#adapters) - * [Junk and random reads](#junk-and-random-reads) - * [Chimeras](#chimeras) - * [Small plasmid bias](#small-plasmid-bias) - * [Glitches](#glitches) - * [Contributing](#contributing) - * [License](#license) +* [Requirements](#requirements) +* [Installation](#installation) +* [Quick usage](#quick-usage) +* [Method](#method) +* [Detailed usage](#detailed-usage) + * [Command line](#command-line) + * [Reference FASTA](#reference-fasta) + * [Fragment lengths](#fragment-lengths) + * [Read identities](#read-identities) + * [Error model](#error-model) + * [QScore model](#qscore-model) + * [Adapters](#adapters) + * [Junk and random reads](#junk-and-random-reads) + * [Chimeras](#chimeras) + * [Small plasmid bias](#small-plasmid-bias) + * [Glitches](#glitches) +* [Contributing](#contributing) +* [License](#license) @@ -90,29 +90,29 @@ badread simulate --reference ref.fasta --quantity 50x \ To simulate older Oxford Nanopore reads (R9.4.1, worse basecalling): ```bash badread simulate --reference ref.fasta --quantity 50x \ ---error_model pacbio2016 --qscore_model nanopore2020 --identity 90,98,5 \ + --error_model nanopore2020 --qscore_model nanopore2020 --identity 90,98,5 \ | gzip > reads.fastq.gz ``` -Alternatively, you can use Badread's built-in models to imitate older PacBio reads. This command also adjusts the identity and length distributions to be a bit more PacBio-2016-like: +Very bad reads: ```bash -badread simulate --reference ref.fasta --quantity 50x \ - --error_model pacbio2016 --qscore_model pacbio2016 --identity 85,95,3 --length 7500,7500 \ +badread simulate --reference ref.fasta --quantity 50x --glitches 1000,100,100 \ + --junk_reads 5 --random_reads 5 --chimeras 10 --identity 80,90,6 --length 4000,2000 \ | gzip > reads.fastq.gz ``` -Very bad reads: +Pretty good reads: ```bash -badread simulate --reference ref.fasta --quantity 50x --glitches 1000,100,100 \ - --junk_reads 5 --random_reads 5 --chimeras 10 --identity 75,90,8 \ +badread simulate --reference ref.fasta --quantity 50x --glitches 10000,10,10 \ + --junk_reads 0.1 --random_reads 0.1 --chimeras 0.1 --identity 20,3 \ | gzip > reads.fastq.gz ``` -Very nice reads: +Very good reads: ```bash badread simulate --reference ref.fasta --quantity 50x --error_model random \ - --qscore_model ideal --glitches 0,0,0 --junk_reads 0 --random_reads 0 \ - --chimeras 0 --identity 99,100,2 --start_adapter_seq "" --end_adapter_seq "" \ + --qscore_model ideal --glitches 0,0,0 --junk_reads 0 --random_reads 0 --chimeras 0 \ + --identity 30,3 --length 40000,20000 --start_adapter_seq "" --end_adapter_seq "" \ | gzip > reads.fastq.gz ``` @@ -225,19 +225,11 @@ For a couple of examples, check out [the reference FASTA page on the wiki](https ### Fragment lengths -Badread generates fragment lengths from a [gamma distribution](https://en.wikipedia.org/wiki/Gamma_distribution). While a gamma distribution is usually parameterised with shape and scale (_k_ and _θ_) or shape and rate (_α_ and _β_), I don't find these particularly intuitive. So Badread instead defines the fragment lengths using mean and standard deviation. +Badread generates fragment lengths from a [gamma distribution](https://en.wikipedia.org/wiki/Gamma_distribution). While a gamma distribution is usually parameterised with shape and scale (_k_ and _θ_) or shape and rate (_α_ and _β_), I don't find these particularly intuitive. So Badread instead defines fragment lengths using mean and standard deviation. -There are two ways to think about fragment lengths: the distribution of the fragment lengths and the distribution of _bases_ in the fragments. The latter distribution is higher because larger fragments contribute more bases. The read N50 is the median of the base (red) distribution – half the bases will be in reads shorter than this and half in longer reads. +There are two ways to think about fragment lengths: the distribution of the fragment lengths and the distribution of _bases_ in the fragments. The latter distribution is higher because larger fragments contribute more bases. The read N50 is the median of the base distribution – half the bases will be in reads shorter than this and half in longer reads. - - - - -
- Default length distribution - Badread's default is --length 15000,13000 (mean=15000, stdev=13000) which corresponds to a decent Nanopore run (N50=22.6 kbp). The fragment length distribution is in blue, while the base distribution is in red.

- To see the equations and interactively explore how different parameters affect the distributions, check out this Desmos plot. -
+Badread's default is `--length 15000,13000` (mean=15000, stdev=13000) which corresponds to a decent Nanopore run (N50=22.6 kbp). To see the equations and interactively explore how different parameters affect the distributions, check out [this Desmos plot](https://www.desmos.com/calculator/z2a3yqssie). Note that these parameters control the length of the _fragments_, not the final _reads_. These differ because: adapters are added to fragments, glitches can lengthen/shorten fragments, adding read errors can change the length (especially if the error model is biased towards insertions or deletions) and chimeras are made by concatenating multiple fragments together. @@ -245,17 +237,16 @@ Note that these parameters control the length of the _fragments_, not the final ### Read identities -Badread generates read identities from a [beta distribution](https://en.wikipedia.org/wiki/Beta_distribution). Like with fragment lengths, Badread defines the distribution with mean and standard deviation instead of using the more formal (and less intuitive) shape parameters (_α_ and _β_). In addition, Badread scales the distribution down using a maximum value, for a total of three parameters: `mean,max,stdev`. - - - - - -
- Default identity distribution - Badread's default is --identity 87.5,97.5,5 which corresponds to an okay (but not great) Nanopore sequencing run.

- To see the equations and interactively explore how different parameters affect the distribution, check out this Desmos plot. -
+Badread can generate read identities in two alternative ways. + +The first way uses a [beta distribution](https://en.wikipedia.org/wiki/Beta_distribution) to sample read identities. Like with fragment lengths, Badread defines the distribution with mean and standard deviation instead of using the more formal (and less intuitive) shape parameters (_α_ and _β_). In addition, Badread scales the distribution down using a maximum value, for a total of three parameters. To use this method, give three comma-delimited values (identity mean, max, stdev), e.g. `--identity 95,99,2.5`. + +The second way uses a [normal distribution](https://en.wikipedia.org/wiki/Normal_distribution) to sample read [qscores](https://en.wikipedia.org/wiki/Phred_quality_score). To use this method, give two comma-delimited values (qscore mean and stdev), e.g. `--identity 20,4`. This approach is better suited to high accuracy reads like ONT duplex or PacBio HiFi. + +Badread's default is `--identity 95,99,2.5` which corresponds to an okay (but not great) R10.4.1 Nanopore sequencing run. To see the equations and interactively explore how different parameters affect the distribution, check out these Desmos plots: +* [three-parameter identity beta distribution](https://www.desmos.com/calculator/u9fivqmisa) +* [two-parameter qscore normal distribution (qscore on x-axis)](https://www.desmos.com/calculator/kpganphxj0) +* [two-parameter qscore normal distribution (identity on x-axis)](https://www.desmos.com/calculator/kg9s0dvbup) For detail on how Badread defines identity, check out [this page on the wiki](https://github.com/rrwick/Badread/wiki/Definition-of-identity). diff --git a/badread/__main__.py b/badread/__main__.py index f2bc3b9..68a78ba 100644 --- a/badread/__main__.py +++ b/badread/__main__.py @@ -97,7 +97,8 @@ def simulate_subparser(subparsers): help='Fragment length distribution (mean and stdev, ' 'default: DEFAULT)') sim_args.add_argument('--identity', type=str, default='95,99,2.5', - help='Sequencing identity distribution (mean, max and stdev, ' + help='Sequencing identity distribution (mean, max and stdev for beta ' + 'distribution, or mean and stdev for normal qscore distribution , ' 'default: DEFAULT)') sim_args.add_argument('--error_model', type=str, default='nanopore2023', help='Can be "nanopore2018", "nanopore2020", "nanopore2023", ' @@ -274,24 +275,20 @@ def check_simulate_args(args): try: identity_parameters = [float(x) for x in args.identity.split(',')] - args.mean_identity = identity_parameters[0] - args.max_identity = identity_parameters[1] - args.identity_stdev = identity_parameters[2] + if len(identity_parameters) == 2: + args.mean_identity = identity_parameters[0] + args.max_identity = None + args.identity_stdev = identity_parameters[1] + check_qscore_identities(args) + elif len(identity_parameters) == 3: + args.mean_identity = identity_parameters[0] + args.max_identity = identity_parameters[1] + args.identity_stdev = identity_parameters[2] + check_beta_identities(args) + else: + sys.exit('Error: could not parse --identity values') except (ValueError, IndexError): sys.exit('Error: could not parse --identity values') - if args.mean_identity > 100.0: - sys.exit('Error: mean read identity cannot be more than 100') - if args.max_identity > 100.0: - sys.exit('Error: max read identity cannot be more than 100') - if args.mean_identity <= settings.MIN_MEAN_READ_IDENTITY: - sys.exit(f'Error: mean read identity must be at least {settings.MIN_MEAN_READ_IDENTITY}') - if args.max_identity <= settings.MIN_MEAN_READ_IDENTITY: - sys.exit(f'Error: max read identity must be at least {settings.MIN_MEAN_READ_IDENTITY}') - if args.mean_identity > args.max_identity: - sys.exit(f'Error: mean identity ({args.mean_identity}) cannot be larger than max ' - f'identity ({args.max_identity})') - if args.identity_stdev < 0.0: - sys.exit('Error: read identity stdev cannot be negative') try: glitch_parameters = [float(x) for x in args.glitches.split(',')] @@ -315,6 +312,29 @@ def check_simulate_args(args): sys.exit('Error: --end_adapter_seq must be a DNA sequence or a number') +def check_beta_identities(args): + if args.mean_identity > 100.0: + sys.exit('Error: mean read identity cannot be more than 100') + if args.max_identity > 100.0: + sys.exit('Error: max read identity cannot be more than 100') + if args.mean_identity <= settings.MIN_MEAN_READ_IDENTITY: + sys.exit(f'Error: mean read identity must be at least {settings.MIN_MEAN_READ_IDENTITY}') + if args.max_identity <= settings.MIN_MEAN_READ_IDENTITY: + sys.exit(f'Error: max read identity must be at least {settings.MIN_MEAN_READ_IDENTITY}') + if args.mean_identity > args.max_identity: + sys.exit(f'Error: mean identity ({args.mean_identity}) cannot be larger than max ' + f'identity ({args.max_identity})') + if args.identity_stdev < 0.0: + sys.exit('Error: read identity stdev cannot be negative') + + +def check_qscore_identities(args): + if args.mean_identity <= settings.MIN_MEAN_READ_QSCORE: + sys.exit(f'Error: mean read identity must be at least {settings.MIN_MEAN_READ_QSCORE}') + if args.identity_stdev < 0.0: + sys.exit('Error: read qscore stdev cannot be negative') + + def check_python_version(): if sys.version_info.major < 3 or sys.version_info.minor < 6: sys.exit('Error: Badread requires Python 3.6 or later') diff --git a/badread/identities.py b/badread/identities.py index 98aa99d..e0c4bf7 100755 --- a/badread/identities.py +++ b/badread/identities.py @@ -1,6 +1,5 @@ """ -This module contains a class for describing read identity distributions (described by the beta -distribution) and related functions. +This module contains a class for describing read identity distributions and related functions. Copyright 2018 Ryan Wick (rrwick@gmail.com) https://github.com/rrwick/Badread @@ -23,11 +22,26 @@ class Identities(object): def __init__(self, mean, stdev, max_identity, output=sys.stderr): + self.mean, self.stdev, self.max_identity = None, None, None + self.beta_a, self.beta_b = None, None + + print('', file=output) + + # There are two possible types of identity distributions: a three-parameter beta + # distribution that describes read identities, or a two-parameter normal distribution that + # describes read qscores. + if max_identity is None: + self.type = "normal" + self.set_up_normal(mean, stdev, output) + else: + self.type = "beta" + self.set_up_beta(mean, stdev, max_identity, output) + + def set_up_beta(self, mean, stdev, max_identity, output): # Divide by 100 to convert from percentage to fraction self.mean = mean / 100.0 self.stdev = stdev / 100.0 self.max_identity = max_identity / 100.0 - print('', file=output) if self.mean == self.max_identity: self.beta_a, self.beta_b = None, None @@ -35,7 +49,7 @@ def __init__(self, mean, stdev, max_identity, output=sys.stderr): elif self.stdev == 0.0: self.max_identity = self.mean print(f'Using a constant read identity of {self.mean * 100}%', file=output) - else: # beta distribution + else: print('Generating read identities from a beta distribution:', file=output) self.beta_a, self.beta_b = beta_parameters(mean, stdev, max_identity) print_in_two_columns(f' mean = {float_to_str(self.mean * 100):>3}%', @@ -47,12 +61,37 @@ def __init__(self, mean, stdev, max_identity, output=sys.stderr): output=output) quickhist_beta(self.beta_a, self.beta_b, self.max_identity, 8, output=output) + def set_up_normal(self, mean, stdev, output): + self.mean = mean + self.stdev = stdev + + if self.stdev == 0.0: + self.max_identity = self.mean + print(f'Using a constant read qscore of {self.mean}', file=output) + else: + print('Generating read qscores from a normal distribution:', file=output) + print(f' mean = {float_to_str(self.mean):>3}', file=output) + print(f' stdev = {float_to_str(self.stdev):>3}', file=output) + def get_identity(self): + while True: + if self.type == "beta": + identity = self.get_beta_identity() + else: + identity = self.get_normal_identity() + if 0 <= identity <= 100: + return identity + + def get_beta_identity(self): if self.mean == self.max_identity: return self.mean else: # beta distribution return self.max_identity * np.random.beta(self.beta_a, self.beta_b) + def get_normal_identity(self): + qscore = np.random.normal(self.mean, self.stdev) + return 1.0 - 10**(-qscore / 10) + def beta_parameters(beta_mean, beta_stdev, beta_max): u, s, m = beta_mean, beta_stdev, beta_max diff --git a/badread/settings.py b/badread/settings.py index edcb002..7d80310 100644 --- a/badread/settings.py +++ b/badread/settings.py @@ -29,6 +29,7 @@ # identity (e.g. 50%) as that might break some things. These settings control how low they can go. MIN_MEAN_READ_LENGTH = 100 MIN_MEAN_READ_IDENTITY = 50 +MIN_MEAN_READ_QSCORE = 5 # If a random qscore model is used, this controls the range of the qualities. diff --git a/badread/simulate.py b/badread/simulate.py index 41518e8..492afbd 100644 --- a/badread/simulate.py +++ b/badread/simulate.py @@ -72,7 +72,7 @@ def simulate(args, output=sys.stderr): info.append(f'length={len(seq)}') info.append(f'error-free_length={len(fragment)}') - info.append(f'read_identity={actual_identity * 100.0:.2f}%') + info.append(f'read_identity={actual_identity * 100.0:.3f}%') read_name = uuid.UUID(int=random.getrandbits(128)) info = ' '.join(info) diff --git a/badread/version.py b/badread/version.py index 5cb304d..aae7198 100644 --- a/badread/version.py +++ b/badread/version.py @@ -14,4 +14,4 @@ If not, see . """ -__version__ = '0.3.0' +__version__ = '0.4.0' diff --git a/test/test_cli.py b/test/test_cli.py index 941f98d..29d5848 100644 --- a/test/test_cli.py +++ b/test/test_cli.py @@ -184,7 +184,7 @@ def test_simulate_help_2(self): self.assertEqual(out.getvalue().strip(), '') self.assertTrue('simulate' in err.getvalue().strip()) - def test_simulate_args(self): + def test_simulate_args_1(self): args = badread.__main__.parse_args(['simulate', '--reference', self.ref, '--quantity', '50x', '--length', '5432,123', '--identity', '78,89,4', '--glitches', '9876,12,34']) @@ -200,6 +200,22 @@ def test_simulate_args(self): self.assertEqual(args.glitch_size, 12) self.assertEqual(args.glitch_skip, 34) + def test_simulate_args_2(self): + args = badread.__main__.parse_args(['simulate', '--reference', self.ref, + '--quantity', '250M', '--length', '2345,321', + '--identity', '20,2', '--glitches', '6789,21,43']) + self.assertEqual(args.reference, self.ref) + self.assertEqual(args.quantity, '250M') + badread.__main__.check_simulate_args(args) + self.assertEqual(args.mean_frag_length, 2345) + self.assertEqual(args.frag_length_stdev, 321) + self.assertEqual(args.mean_identity, 20) + self.assertEqual(args.max_identity, None) + self.assertEqual(args.identity_stdev, 2) + self.assertEqual(args.glitch_rate, 6789) + self.assertEqual(args.glitch_size, 21) + self.assertEqual(args.glitch_skip, 43) + def test_bad_simulated_args_1(self): self.check_simulate_error(['--chimera', '101'], 'cannot be greater than') @@ -232,45 +248,51 @@ def test_bad_simulated_args_10(self): self.check_simulate_error(['--identity', '85,A,5'], 'could not parse') def test_bad_simulated_args_11(self): - self.check_simulate_error(['--identity', '85,101,5'], 'cannot be more than') + self.check_simulate_error(['--identity', '20,19,18,17'], 'could not parse') def test_bad_simulated_args_12(self): - self.check_simulate_error(['--identity', '101,102,5'], 'cannot be more than') + self.check_simulate_error(['--identity', '85,101,5'], 'cannot be more than') def test_bad_simulated_args_13(self): - self.check_simulate_error(['--identity', '85,80,5'], 'cannot be larger than max') + self.check_simulate_error(['--identity', '101,102,5'], 'cannot be more than') def test_bad_simulated_args_14(self): - self.check_simulate_error(['--identity', '85,90,-3'], 'cannot be negative') + self.check_simulate_error(['--identity', '85,80,5'], 'cannot be larger than max') def test_bad_simulated_args_15(self): - self.check_simulate_error(['--identity', '30,90,5'], 'must be at least') + self.check_simulate_error(['--identity', '85,90,-3'], 'cannot be negative') def test_bad_simulated_args_16(self): - self.check_simulate_error(['--identity', '90,30,5'], 'must be at least') + self.check_simulate_error(['--identity', '30,90,5'], 'must be at least') def test_bad_simulated_args_17(self): - self.check_simulate_error(['--glitches', 'abc'], 'could not parse') + self.check_simulate_error(['--identity', '90,30,5'], 'must be at least') def test_bad_simulated_args_18(self): - self.check_simulate_error(['--glitches', '500,Z,10'], 'could not parse') + self.check_simulate_error(['--identity', '1,1'], 'must be at least') def test_bad_simulated_args_19(self): - self.check_simulate_error(['--glitches', '500,-10,10'], 'must contain non-negative') + self.check_simulate_error(['--glitches', 'abc'], 'could not parse') def test_bad_simulated_args_20(self): - self.check_simulate_error(['--error_model', 'not_a_file'], 'or a filename') + self.check_simulate_error(['--glitches', '500,Z,10'], 'could not parse') def test_bad_simulated_args_21(self): - self.check_simulate_error(['--qscore_model', 'not_a_file'], 'or a filename') + self.check_simulate_error(['--glitches', '500,-10,10'], 'must contain non-negative') def test_bad_simulated_args_22(self): - self.check_simulate_error([], 'not a file', ref='not_a_file') + self.check_simulate_error(['--error_model', 'not_a_file'], 'or a filename') def test_bad_simulated_args_23(self): - self.check_simulate_error(['--start_adapter_seq', 'ACGTQ'], 'must be a DNA sequence') + self.check_simulate_error(['--qscore_model', 'not_a_file'], 'or a filename') def test_bad_simulated_args_24(self): + self.check_simulate_error([], 'not a file', ref='not_a_file') + + def test_bad_simulated_args_25(self): + self.check_simulate_error(['--start_adapter_seq', 'ACGTQ'], 'must be a DNA sequence') + + def test_bad_simulated_args_26(self): self.check_simulate_error(['--end_adapter_seq', '12ACT'], 'must be a DNA sequence') def test_good_simulated_args_1(self): diff --git a/test/test_identities.py b/test/test_identities.py index 3afa512..1f34f6f 100644 --- a/test/test_identities.py +++ b/test/test_identities.py @@ -89,3 +89,33 @@ def test_bad_identity(self): identities = badread.identities.Identities(81.9, 5.5, 82.1, output=self.null) identities.get_identity() self.assertTrue('invalid beta parameters' in str(cm.exception)) + + +class TestNormalIdentity(unittest.TestCase): + + def setUp(self): + self.null = open(os.devnull, 'w') + self.trials = 100000 + + def tearDown(self): + self.null.close() + + def test_normal_identity_1(self): + identities = badread.identities.Identities(20, 2, None, output=self.null) + mean = sum(identities.get_identity() for _ in range(self.trials)) / self.trials + self.assertAlmostEqual(mean, 0.98888, delta=0.01) + + def test_normal_identity_2(self): + identities = badread.identities.Identities(10, 0, None, output=self.null) + mean = sum(identities.get_identity() for _ in range(self.trials)) / self.trials + self.assertAlmostEqual(mean, 0.9, delta=0.01) + + def test_normal_identity_3(self): + identities = badread.identities.Identities(20, 0, None, output=self.null) + mean = sum(identities.get_identity() for _ in range(self.trials)) / self.trials + self.assertAlmostEqual(mean, 0.99, delta=0.01) + + def test_normal_identity_4(self): + identities = badread.identities.Identities(30, 0, None, output=self.null) + mean = sum(identities.get_identity() for _ in range(self.trials)) / self.trials + self.assertAlmostEqual(mean, 0.999, delta=0.01)