Skip to content

Commit

Permalink
Enable create_test_subset.py to update @RG SM:{id} values
Browse files Browse the repository at this point in the history
  • Loading branch information
jmarshall committed Jul 11, 2023
1 parent e917f27 commit 1c37469
Showing 1 changed file with 45 additions and 3 deletions.
48 changes: 45 additions & 3 deletions scripts/create_test_subset.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,13 +99,20 @@
This is in addition to the number of families specified in
--families and the number of samples specified in -n""",
)
@click.option(
'--embedded-ids/--no-embedded-ids',
'embedded_ids',
default=True,
help='Update IDs embedded within files (currently for CRAM only)',
)
def main(
project: str,
samples_n: Optional[int],
families_n: Optional[int],
skip_ped: Optional[bool] = True,
additional_families: Optional[tuple[str]] = None,
additional_samples: Optional[tuple[str]] = None,
embedded_ids: Optional[bool] = False,
):
"""
Script creates a test subset for a given project.
Expand Down Expand Up @@ -306,6 +313,7 @@ def main(
analysis['output'],
project,
(str(s['id']), new_sample_map[s['id']]),
embedded_ids,
),
status=AnalysisStatus(analysis['status']),
sample_ids=[new_sample_map[s['id']]],
Expand Down Expand Up @@ -629,7 +637,30 @@ def _get_random_families(
return returned_families


def _copy_files_in_dict(d, dataset: str, sid_replacement: tuple[str, str] = None):
def _rewrite_file(old_path: str, new_path: str, new_base_path: str, sid: tuple[str, str]):
    """
    Build the shell command that produces `new_path`, choosing by its extension.

    Nothing is executed here: the command is only returned as a string, for
    the caller to pass to a shell (the pipelines rely on `|`).

    * `.cram` -- stream `old_path` through `samtools reheader`, using sed to
      swap `SM:{sid[0]}` for `SM:{sid[1]}` on `@RG` header lines, and upload
      the result to `new_path`.
    * `.crai` -- regenerate the index from `new_base_path` with
      `samtools index` rather than copying the old one.
    * `.md5`  -- regenerate the checksum from `new_base_path` with `md5sum`.
    * anything else -- plain `gsutil cp` from `old_path` to `new_path`.

    Parameters
    ----------
    old_path: source path of the file being copied
    new_path: destination path; its suffix selects the command
    new_base_path: destination path of the primary file that a `.crai` /
        `.md5` companion is derived from
    sid: (old sample ID, new sample ID); only indexed for `.cram` targets

    NOTE(review): the sample IDs are interpolated into the sed expression
    as-is (no regex/shell escaping, no end-of-field anchor) -- presumably
    fine for the ID format in use, but worth confirming.
    """

    def regenerate(tool: str) -> str:
        # Companion files are rebuilt from the already-rewritten base file.
        return f'\ngsutil cp {new_base_path!r} - | {tool} | gsutil cp - {new_path!r}\n'

    if new_path.endswith('.cram'):
        sed_script = f'/^@RG/s/SM:{sid[0]}/SM:{sid[1]}/g'
        return (
            '\n'
            f'gsutil cp {old_path!r} - |\n'
            f"samtools reheader --no-PG -c 'sed {sed_script}' /dev/stdin |\n"
            f'gsutil cp - {new_path!r}\n'
        )

    if new_path.endswith('.crai'):
        return regenerate('samtools index -o - -')

    if new_path.endswith('.md5'):
        logger.info(f'Rewriting to {new_path}: check MD5 format')
        return regenerate('md5sum')

    logger.info(f'Copying to {new_path} without any rewriting')
    return f'gsutil cp {old_path!r} {new_path!r}'


def _copy_files_in_dict(
d, dataset: str, sid_replacement: tuple[str, str] = None, embedded_ids: bool = False
):
"""
Replaces all `gs://cpg-{project}-main*/` paths
into `gs://cpg-{project}-test*/` and creates copies if needed
Expand All @@ -653,17 +684,28 @@ def _copy_files_in_dict(d, dataset: str, sid_replacement: tuple[str, str] = None
new_path = new_path.replace(sid_replacement[0], sid_replacement[1])

if not file_exists(new_path):
cmd = f'gsutil cp {old_path!r} {new_path!r}'
if embedded_ids and sid_replacement is not None:
cmd = _rewrite_file(old_path, new_path, new_path, sid_replacement)
else:
cmd = f'gsutil cp {old_path!r} {new_path!r}'

logger.info(f'Copying file in metadata: {cmd}')
subprocess.run(cmd, check=False, shell=True)
else:
if embedded_ids and sid_replacement is not None and new_path.endswith('.cram'):
logger.error(f'IDs embedded in {new_path} not updated: already exists')

extra_exts = ['.md5']
if new_path.endswith('.vcf.gz'):
extra_exts.append('.tbi')
if new_path.endswith('.cram'):
extra_exts.append('.crai')
for ext in extra_exts:
if file_exists(old_path + ext) and not file_exists(new_path + ext):
cmd = f'gsutil cp {old_path + ext!r} {new_path + ext!r}'
if embedded_ids and sid_replacement is not None:
cmd = _rewrite_file(old_path + ext, new_path + ext, new_path, sid_replacement)
else:
cmd = f'gsutil cp {old_path + ext!r} {new_path + ext!r}'
logger.info(f'Copying extra file in metadata: {cmd}')
subprocess.run(cmd, check=False, shell=True)
return new_path
Expand Down

0 comments on commit 1c37469

Please sign in to comment.