Skip to content

Commit

Permalink
4 samtools fastq threads, restores space before /1, decontaminating--…
Browse files Browse the repository at this point in the history
…>cleaning, +1 test
  • Loading branch information
bede committed Nov 10, 2023
1 parent 4431bdd commit 78940bf
Show file tree
Hide file tree
Showing 4 changed files with 44 additions and 17 deletions.
10 changes: 5 additions & 5 deletions src/hostile/aligner.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ def gen_clean_cmd(
for k in cmd_template.keys():
alignment_cmd = alignment_cmd.replace(k, cmd_template[k])
rename_cmd = (
' | awk \'BEGIN{{FS=OFS="\\t"}} {{$1=int((NR+1)/2)" "; print $0}}\''
' | awk \'BEGIN{{FS=OFS="\\t"}} {{$1=int(NR)" "; print $0}}\''
if rename
else ""
)
Expand All @@ -108,10 +108,10 @@ def gen_clean_cmd(
f" | samtools view -f 4 -"
# Count reads in stream after filtering
f" | tee >(samtools view -F 256 -c - > '{count_after_path}')"
# Optionally replace paired read headers with integers
# Optionally replace read headers with integers
f"{rename_cmd}"
# Stream remaining records into fastq files
f" | samtools fastq --threads 2 -c 6 -0 '{fastq_out_path}'"
f" | samtools fastq --threads 4 -c 6 -0 '{fastq_out_path}'"
)
return cmd

Expand Down Expand Up @@ -153,7 +153,7 @@ def gen_paired_clean_cmd(
for k in cmd_template.keys():
alignment_cmd = alignment_cmd.replace(k, cmd_template[k])
rename_cmd = (
f' | awk \'BEGIN{{FS=OFS="\\t"}} {{$1=int((NR+1)/2)""; print $0}}\''
f' | awk \'BEGIN{{FS=OFS="\\t"}} {{$1=int((NR+1)/2)" "; print $0}}\''
if rename
else ""
)
Expand All @@ -169,6 +169,6 @@ def gen_paired_clean_cmd(
# Optionally replace paired read headers with integers
f"{rename_cmd}"
# Stream remaining records into fastq files
f" | samtools fastq --threads 2 -c 6 -N -1 '{fastq1_out_path}' -2 '{fastq2_out_path}'"
f" | samtools fastq --threads 4 -c 6 -N -1 '{fastq1_out_path}' -2 '{fastq2_out_path}'"
)
return cmd
10 changes: 5 additions & 5 deletions src/hostile/lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,11 +224,11 @@ def clean_fastqs(
for fastq in fastqs
]
logging.info("Cleaning…")
util.run_bash_parallel(backend_cmds, description="Decontaminating…")
util.run_bash_parallel(backend_cmds, description="Cleaning…")
stats = gather_stats(
rename=rename, fastqs=fastqs, out_dir=out_dir, aligner=aligner.name, index=index
)
logging.info("Finished decontamination")
logging.info("Finished cleaning")
return stats


Expand Down Expand Up @@ -263,12 +263,12 @@ def clean_paired_fastqs(
for pair in fastqs
]
logging.debug(f"{backend_cmds=}")
logging.info("Started decontamination")
util.run_bash_parallel(backend_cmds, description="Decontaminating…")
logging.info("Cleaning…")
util.run_bash_parallel(backend_cmds, description="Cleaning")
stats = gather_stats_paired(
rename=rename, fastqs=fastqs, out_dir=out_dir, aligner=aligner.name, index=index
)
logging.info("Finished decontamination")
logging.info("Finished cleaning")
return stats


Expand Down
8 changes: 8 additions & 0 deletions tests/data/tuberculosis_2.fastq
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
@Mycobacterium_tuberculosis_1
TCCCGTCGTAAGCATCGATTCCGACGCGCTGGATGCTGCCCGCATGCTCGCAGAGCATCGTCTGCCTGGACTATTGGTCACCGCCGGAGCGGGCAAACAGTATGCGGTACTCCCTGCCTCACAGGTCGTGCGCTTCATCGTGCCCCGCTG
+
5430361214022-5224244425513232342544661102331222222/0231322106422524012261/22022/32253121.4240226412221023435/3264252222142422302224322324/222224/2322
@Mycobacterium_tuberculosis_2
CCCGTCGTAAGCATCGATTCCGACGCGCTGGATGCTGCCCGCATGCTCGCAGAGCATCGTCTGCCTGGACTATTGGTCACCGCCGGAGCGGGCAAACAGTATGCGGTACTCCCTGCCTCACAGGTCGTGCGCTTCATCGTGCCCCGCTAA
+
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
33 changes: 26 additions & 7 deletions tests/test_all.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,12 @@ def run(cmd: str, cwd: Path = Path()): # Helper for CLI testing
)


def get_first_line_of_gzip_file(file_path):
def get_nth_line_of_gzip_file(file_path, line_number=1):
with gzip.open(file_path, "rt") as fh:
return fh.readline().strip()
for i, line in enumerate(fh, start=1):
if i == line_number:
return line.strip()
return None


def test_version_cli():
Expand Down Expand Up @@ -234,10 +237,26 @@ def test_rename():
rename=True,
out_dir=out_dir,
)
first_line = get_first_line_of_gzip_file(
out_dir / "tuberculosis_1_1.clean.fastq.gz"
first_line = get_nth_line_of_gzip_file(out_dir / "tuberculosis_1_1.clean.fastq.gz")
assert first_line == "@1"
assert stats[0]["rename"] == True
shutil.rmtree(out_dir, ignore_errors=True)


def test_rename_two_records():
stats = lib.clean_fastqs(
fastqs=[data_dir / "tuberculosis_2.fastq"],
aligner=lib.ALIGNER.bowtie2,
index=data_dir / "sars-cov-2/sars-cov-2",
rename=True,
out_dir=out_dir,
)
first_line = get_nth_line_of_gzip_file(out_dir / "tuberculosis_2.clean.fastq.gz")
fifth_line = get_nth_line_of_gzip_file(
out_dir / "tuberculosis_2.clean.fastq.gz", line_number=5
)
assert first_line == "@1"
assert fifth_line == "@2"
assert stats[0]["rename"] == True
shutil.rmtree(out_dir, ignore_errors=True)

Expand All @@ -255,10 +274,10 @@ def test_paired_rename():
rename=True,
out_dir=out_dir,
)
first_line = get_first_line_of_gzip_file(
first_line = get_nth_line_of_gzip_file(
out_dir / "tuberculosis_1_2.clean_1.fastq.gz"
)
assert first_line == "@1/1"
assert first_line == "@1 /1"
assert stats[0]["rename"] == True
shutil.rmtree(out_dir, ignore_errors=True)

Expand Down Expand Up @@ -306,7 +325,7 @@ def test_no_rename():
out_dir=out_dir,
force=True,
)
first_line = get_first_line_of_gzip_file(
first_line = get_nth_line_of_gzip_file(
(out_dir / "tuberculosis_1_2.clean_1.fastq.gz").resolve()
)
assert first_line == "@Mycobacterium_tuberculosis/1"
Expand Down

0 comments on commit 78940bf

Please sign in to comment.