diff --git a/server/pyproject.toml b/server/pyproject.toml index 4b6bb256..7c29a058 100755 --- a/server/pyproject.toml +++ b/server/pyproject.toml @@ -42,9 +42,6 @@ dependencies = [ [project.scripts] upsert = "scimodom.database.maintenance.upsert:main" -add-project = "scimodom.database.maintenance.add_project:main" -add-dataset = "scimodom.database.maintenance.add_dataset:main" -add-all = "scimodom.database.maintenance.add_all:main" [project.urls] Github = "https://github.com/dieterich-lab/scimodom" diff --git a/server/src/scimodom/app.py b/server/src/scimodom/app.py index 9e9851f7..bd2874ed 100644 --- a/server/src/scimodom/app.py +++ b/server/src/scimodom/app.py @@ -9,6 +9,7 @@ from scimodom.database.database import make_session, init from scimodom.frontend import frontend from scimodom.plugins.cli import ( + add_annotation, add_assembly, add_project, add_dataset, @@ -45,6 +46,17 @@ def assembly(id): """ add_assembly(id) + @app.cli.command( + "annotation", epilog="Check docs at https://dieterich-lab.github.io/scimodom/." + ) + @click.argument("id", type=click.INT) + def annotation(id): + """Prepare annotation. + + ID is the annotation_id (must already exist). + """ + add_annotation(id) + @app.cli.command( "project", epilog="Check docs at https://dieterich-lab.github.io/scimodom/." ) diff --git a/server/src/scimodom/database/maintenance/add_all.py b/server/src/scimodom/database/maintenance/add_all.py deleted file mode 100644 index f5ae7676..00000000 --- a/server/src/scimodom/database/maintenance/add_all.py +++ /dev/null @@ -1,219 +0,0 @@ -#! /usr/bin/env python3 - -"""Wrapper script to add projects and dataset -from project templates (json only). - -NOTE: For maintainers to batch add data, but these extra (optional) -fields are required in the template: file, data_title. -This script is not particularly efficient nor safe, but it is not -intended for general usage: we just want to batch add some data -to the DB in the short-term. 
-""" - -from argparse import ArgumentParser, SUPPRESS -from collections import defaultdict -from concurrent.futures import ProcessPoolExecutor -from functools import partial -import json -import logging -import os -from pathlib import Path -from subprocess import Popen, PIPE - -from scimodom.config import Config -from scimodom.database.database import make_session -from scimodom.services.project import ProjectService -import scimodom.utils.utils as utils - -logger = logging.getLogger(__name__) - -extra_cols = ["file", "data_title"] - - -def _get_templates(args): - all_paths = [] - for f in args.project_template: - path = Path(args.directory, f"{f}.json") - if not path.is_file(): - msg = f"Template {path} missing. Skipping!" - logger.error(msg) - continue - all_paths.append(path) - return all_paths - - -def _get_projects(templates): - all_projects = [] - for template in templates: - handle = open(template, "r") - project = json.load(handle) - project["path"] = template.as_posix() - for d in utils.to_list(project["metadata"]): - try: - utils.check_keys_exist(d.keys(), extra_cols) - except: - msg = f"Missing keys in {template} for metadata. Skipping project alltogether!" - logger.error(msg) - break - else: - all_projects.append(project) - handle.close() - return all_projects - - -def _get_dataset(project): - d = defaultdict(list) - for metadata in project["metadata"]: - d[metadata["file"]].append(metadata) - return d - - -def _add_dataset(key, data, smid, args): - metadata = data[key] - d = metadata[0] - if len(metadata) > 1: - # assume all remaining entries are identical... - # this might not be true... 
- d["rna"] = " ".join([m["rna"] for m in metadata]) - d["modomics_id"] = " ".join([m["modomics_id"] for m in metadata]) - call = [ - "add-dataset", - "-smid", - smid, - "--title", - f'"{d["data_title"]}"', - "--file", - Path(args.directory, key).as_posix(), - "-o", - str(d["organism"]["taxa_id"]), - "-a", - d["organism"]["assembly"], - "-m", - d["modomics_id"], - "-rna", - d["rna"], - "--modomics", - "-t", - f'"{d["tech"]}"', - "-cto", - f'"{d["organism"]["cto"]}"', - "-db", - args.database, - ] - p = Popen(f"printf 'Y' | {' '.join(call)}", stdout=PIPE, stderr=PIPE, shell=True) - stdout, stderr = p.communicate() - - return key, (p.returncode, stdout, stderr) - - -def main(): - parser = ArgumentParser( - add_help=False, description="""Add project and dataset from template.""" - ) - - required = parser.add_argument_group("required arguments") - optional = parser.add_argument_group("optional arguments") - - required.add_argument( - "-d", - "--directory", - help="Directory where project templates and files are located.", - type=str, - required=True, - ) - - required.add_argument( - "-pt", - "--project-template", - help="Space-separated list of project templates (file name only w/o extension)", - nargs="+", - type=str, - required=True, - ) - - optional.add_argument( - "-h", - "--help", - action="help", - default=SUPPRESS, - help="show this help message and exit", - ) - - optional.add_argument( - "-db", - "--database", - help="Database URI", - type=str, - default=Config.DATABASE_URI, - ) - - optional.add_argument( - "--map-async", - help="""Concurrent calls to 'add-dataset'. 
Caution: - filling of the /tmp space due to concurrent pybedtools - operations may occur!""", - action="store_true", - ) - - utils.add_log_opts(parser) - args = parser.parse_args() - utils.update_logging(args) - - engine, session_factory = make_session(args.database) - session = session_factory() - - templates = _get_templates(args) - projects = _get_projects(templates) - - # add projects - for project in projects: - try: - service = ProjectService(session, project) - service.create_project() - project["SMID"] = service.get_smid() - except Exception as e: - msg = ( - f"Failed to add project template {project['path']}. " - f"Exception is {e}. Skipping!" - ) - logger.error(msg) - - # add data - for project in projects: - smid = project.get("SMID", None) - if smid is None: - msg = f"Missing SMID! Failed to add project data for {project['title']}. Skipping!" - logger.error(msg) - continue - metadata = _get_dataset(project) - if not args.map_async: - for key in metadata.keys(): - _, ret = _add_dataset(key, metadata, smid, args) - msg = ( - f"Subprocess returned with {ret[0]} for dataset {key}. " - f"Traceback stdout: {ret[1]}. " - f"Traceback stderr: {ret[2]}." - ) - if not ret[0] == 0: - logger.error(msg) - else: - logger.warning(msg) - else: - with ProcessPoolExecutor(max_workers=os.cpu_count()) as ppe: - for key, ret in ppe.map( - partial(_add_dataset, data=metadata, smid=smid, args=args), - metadata.keys(), - ): - msg = ( - f"Subprocess returned with {ret[0]} for dataset {key}. " - f"Traceback stdout: {ret[1]}. " - f"Traceback stderr: {ret[2]}." - ) - if not ret[0] == 0: - logger.error(msg) - else: - logger.warning(msg) - - -if __name__ == "__main__": - main() diff --git a/server/src/scimodom/database/maintenance/add_dataset.py b/server/src/scimodom/database/maintenance/add_dataset.py deleted file mode 100644 index 335ffe6e..00000000 --- a/server/src/scimodom/database/maintenance/add_dataset.py +++ /dev/null @@ -1,351 +0,0 @@ -#! 
/usr/bin/env python3 - -"""Maintenance script for the DataService utility. -This script allows to create a new dataset for an existing -project. -""" - -from argparse import ArgumentParser, SUPPRESS -import logging -from pathlib import Path - -from sqlalchemy import select - -from scimodom.config import Config -from scimodom.database.database import make_session -from scimodom.database.models import ( - Project, - Taxa, - Assembly, - Modomics, - Modification, - DetectionTechnology, - Organism, - Selection, -) -import scimodom.database.queries as queries -from scimodom.services.annotation import AnnotationService -from scimodom.services.dataset import DataService -import scimodom.utils.utils as utils - -logger = logging.getLogger(__name__) - - -def main(): - parser = ArgumentParser( - add_help=False, - description="""Add new dataset to DB for SMID - create EUFID. Project should exists, incl. default setup.""", - ) - - required = parser.add_argument_group("required arguments") - optional = parser.add_argument_group("optional arguments") - - required.add_argument( - "-smid", - "--project-id", - help="""Existing project ID (SMID)""", - type=str, - required=True, - ) - - required.add_argument( - "--title", - help="""Dataset title""", - type=str, - required=True, - ) - - required.add_argument( - "--file", - help="""Path to bedRMod file""", - type=str, - required=True, - ) - - optional.add_argument( - "-h", - "--help", - action="help", - default=SUPPRESS, - help="show this help message and exit", - ) - - optional.add_argument( - "-db", - "--database", - help="Database URI", - type=str, - default=Config.DATABASE_URI, - ) - - optional.add_argument( - "-o", - "--organism", - help="""NCBI Taxa ID""", - type=int, - ) - - optional.add_argument( - "-a", - "--assembly", - help="""Valid name of assembly""", - type=str, - ) - - optional.add_argument( - "-m", - "--modification", - help="""Valid names of modifications (MODOMICS short name or MODOMICS ID if [--modomics])""", - 
nargs="+", - type=str, - ) - - optional.add_argument( - "--modomics", - help="""Use MODOMICS ID for [--modification]""", - action="store_true", - ) - - optional.add_argument( - "-rna", - "--rna-type", - help="""Valid name of RNA type, must match [--modification]""", - nargs="+", - type=str, - choices=["mRNA", "rRNA", "tRNA"], # TODO: FIX/UPDATE - ) - - optional.add_argument( - "-t", - "--technology", - help="""Valid name of technology. Name must be unique wrt to method, otherwise use [--technology-id]""", - type=str, - ) - - optional.add_argument( - "-cto", - "--cell-type", - help="""Valid name of cell, tissue, or organism, matched with [--organism]""", - type=str, - ) - - optional.add_argument( - "--assembly-id", - help="""Assembly ID. If given, overrides [--assembly]""", - type=int, - ) - - optional.add_argument( - "--modification-id", - help="""Modification ID. If given, overrides [--modification] and [--rna-type]""", - nargs="+", - type=int, - ) - - optional.add_argument( - "--technology-id", - help="""Technology ID. If given, overrides [--technology]""", - type=int, - ) - - optional.add_argument( - "--cto-id", help="""Organism ID. If given, overrides [--organism]""", type=int - ) - - utils.add_log_opts(parser) - args = parser.parse_args() - utils.update_logging(args) - - engine, session_factory = make_session(args.database) - session = session_factory() - # do quite a bit of validation... - - # is SMID valid? - try: - smid = session.execute( - select(Project.id).where(Project.id == args.project_id) - ).one() - except: - msg = f"Given project ID SMID={args.project_id} not found! Terminating!" - logger.error(msg) - return - smid = smid[0] - - # input file - if not Path(args.file).is_file(): - msg = ( - f"Given input file={args.file} not found, or not a valid file! Terminating!" - ) - logger.error(msg) - return - handle = open(args.file, "r") - - # is organism valid? 
- try: - taxa_id = session.execute(select(Taxa.id).where(Taxa.id == args.organism)).one() - except: - msg = f"Given organism Taxa ID={args.organism} not found! Terminating!" - logger.error(msg) - return - taxa_id = taxa_id[0] - - # is assembly valid? - if args.assembly_id: - ids = session.execute(select(Assembly.id)).scalars().all() - if args.assembly_id not in ids: - msg = f"Given assembly ID {args.assembly} not found! Terminating!" - logger.error(msg) - return - assembly_id = args.assembly_id - else: - try: - assembly_id = session.execute( - select(Assembly.id).where( - Assembly.name == args.assembly, Assembly.taxa_id == taxa_id - ) - ).one() - assembly_id = assembly_id[0] - except: - msg = f"Given assembly {args.assembly} with taxa ID={taxa_id} not found! Terminating!" - logger.error(msg) - return - - # are modifications valid? - if args.modification_id: - ids = session.execute(select(Modification.id)).scalars().all() - if not all(m in ids for m in args.modification_id): - msg = f"Some modification IDs {args.modification_id} were not found! Terminating!" - logger.error(msg) - return - modification_ids = args.modification_id - else: - if len(args.modification) != len(args.rna_type): - msg = "Number of [--modification] must match number of [--rna-type]! Terminating!" - logger.error(msg) - return - modification_ids = [] - for modification, rna in zip(args.modification, args.rna_type): - try: - if args.modomics: - modomics_id = modification - else: - modomics_id = session.execute( - select(Modomics.id).where(Modomics.short_name == modification) - ).scalar() - modification_id = session.execute( - select(Modification.id).where( - Modification.modomics_id == modomics_id, - Modification.rna == rna, - ) - ).one() - modification_ids.append(modification_id[0]) - except: - msg = f"Modification {modification} (RNA type = {rna}) not found! Terminating!" - logger.error(msg) - return - - # is technology valid? 
- # in principle if same tech with 2 meth, one() should raise an error... - if args.technology_id: - ids = session.execute(select(DetectionTechnology.id)).scalars().all() - if args.technology_id not in ids: - msg = f"Technology ID {args.technology_id} not found! Terminating!" - logger.error(msg) - return - technology_id = args.technology_id - else: - try: - technology_id = session.execute( - select(DetectionTechnology.id).where( - DetectionTechnology.tech == args.technology - ) - ).one() - technology_id = technology_id[0] - except: - msg = f"Given technology {args.technology} not found, or ambiguous selection due to method! Terminating!" - logger.error(msg) - return - - # is technology valid? - # in principle if same tech with 2 meth, one() should raise an error... - if args.cto_id: - ids = session.execute(select(Organism.id)).scalars().all() - if args.cto_id not in ids: - msg = f"Organism ID {args.cto_id} not found! Terminating!" - logger.error(msg) - return - cto_id = args.cto_id - else: - try: - cto_id = session.execute( - select(Organism.id).where( - Organism.cto == args.cell_type, - Organism.taxa_id == taxa_id, - ) - ).one() - cto_id = cto_id[0] - except: - msg = f"Given cell type {args.cell_type} for Taxa ID {taxa_id} not found! Terminating!" - logger.error(msg) - return - - # double check, are selections valid? - for idx in modification_ids: - try: - selection_id = session.execute( - select(Selection.id).where( - Selection.modification_id == idx, - Selection.technology_id == technology_id, - Selection.organism_id == cto_id, - ) - ).one() - except: - msg = "Given selection not found! Terminating!" 
- logger.error(msg) - return - - # call - service = DataService( - session, - smid, - args.title, - args.file, - handle, - taxa_id, - assembly_id, - modification_ids, - technology_id, - cto_id, - ) - - query = queries.query_column_where(Taxa, "name", filters={"id": taxa_id}) - organism = session.execute(query).scalar() - query = queries.query_column_where(Assembly, "name", filters={"id": assembly_id}) - assembly = session.execute(query).scalar() - modifications = [] - for idx in modification_ids: - records = session.execute( - select(Modification).where(Modification.id == idx) - ).scalar() - modifications.append(f"{records.modomics_id} ({records.rna})") - query = queries.query_column_where( - DetectionTechnology, "tech", filters={"id": technology_id} - ) - technology = session.execute(query).scalar() - query = queries.query_column_where(Organism, "cto", filters={"id": cto_id}) - cto = session.execute(query).scalar() - - msg = ( - f"Adding dataset for Taxa ID={organism}; Assembly={assembly}; Modifications={', '.join(modifications)}; " - f"Technology={technology}; and Cell/Tissue/Organ={cto} to project {smid} from {args.file}..." - ) - if not utils.confirm(msg): - return - eufid = service.create_dataset() - - service = AnnotationService(session, eufid=eufid) - service.annotate_data() - - -if __name__ == "__main__": - main() diff --git a/server/src/scimodom/database/maintenance/add_project.py b/server/src/scimodom/database/maintenance/add_project.py deleted file mode 100644 index 2cbdd56d..00000000 --- a/server/src/scimodom/database/maintenance/add_project.py +++ /dev/null @@ -1,76 +0,0 @@ -#! /usr/bin/env python3 - -"""Maintenance script for the ProjectService utility. -This script allows to create a new project from an existing -configuration json file. It calls the SetupService by default -and upsert all DB tables. 
-""" - -from argparse import ArgumentParser, SUPPRESS -import json -import logging - -from scimodom.config import Config -from scimodom.database.database import make_session -from scimodom.services.project import ProjectService -from scimodom.services.setup import SetupService -import scimodom.utils.utils as utils - -logger = logging.getLogger(__name__) - - -def main(): - parser = ArgumentParser( - add_help=False, description="""Add new project to DB - create SMID.""" - ) - - required = parser.add_argument_group("required arguments") - optional = parser.add_argument_group("optional arguments") - - required.add_argument( - "-p", - "--project", - help="""INSERT new project using [--project PROJECT], - where PROJECT is a json file with required fields""", - type=str, - required=True, - ) - - optional.add_argument( - "-h", - "--help", - action="help", - default=SUPPRESS, - help="show this help message and exit", - ) - - optional.add_argument( - "-db", - "--database", - help="Database URI", - type=str, - default=Config.DATABASE_URI, - ) - - utils.add_log_opts(parser) - args = parser.parse_args() - utils.update_logging(args) - - engine, session_factory = make_session(args.database) - session = session_factory() - setup = SetupService(session) - setup.upsert_all() - - # load project metadata - project = json.load(open(args.project)) - # add project - msg = f"Adding project ({args.project}) to {args.database}..." 
- if not utils.confirm(msg): - return - service = ProjectService(session, project) - service.create_project() - return service.get_smid() - - -if __name__ == "__main__": - main() diff --git a/server/src/scimodom/plugins/__init__.py b/server/src/scimodom/plugins/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/server/src/scimodom/plugins/cli.py b/server/src/scimodom/plugins/cli.py index 0f89f00c..fa361432 100644 --- a/server/src/scimodom/plugins/cli.py +++ b/server/src/scimodom/plugins/cli.py @@ -16,6 +16,7 @@ Assembly, ) import scimodom.database.queries as queries +from scimodom.services.annotation import AnnotationService from scimodom.services.assembly import AssemblyService from scimodom.services.project import ProjectService from scimodom.services.dataset import DataService @@ -52,6 +53,29 @@ def add_assembly(assembly_id: int) -> None: session.close() +def add_annotation(annotation_id: int) -> None: + """Provides a CLI function to set up a new annotation. + This function does not add a new annotation to the database, + but merely creates the data structure. + + :param annotation_id: Annotation ID, must exist. + :type annotation_id: int + """ + session = get_session() + service = AnnotationService(session, annotation_id=annotation_id) + click.secho( + f"Preparing annotation for {service._taxid} ({service._release}) to {Config.DATABASE_URI}...", + fg="green", + ) + click.secho("Continue [y/n]?", fg="green") + c = click.getchar() + if c not in ["y", "Y"]: + return + service.create_annotation() + click.secho("Successfully created.", fg="green") + session.close() + + def add_project(project_template: str | Path) -> None: """Provides a CLI function to add a new project. 
@@ -269,7 +293,7 @@ def add_all(directory: Path, templates: list[str]) -> None: def _get_single(metadata, title): return ( metadata["rna"], - metadata["modomics_id"], + [metadata["modomics_id"]], metadata["method_id"], metadata["tech"], int(metadata["organism"]["taxa_id"]), diff --git a/server/src/scimodom/services/importer/header.py b/server/src/scimodom/services/importer/header.py index 95458a30..0aa8407d 100644 --- a/server/src/scimodom/services/importer/header.py +++ b/server/src/scimodom/services/importer/header.py @@ -185,7 +185,8 @@ def _validate_columns(self) -> None: """Validate if the file has the minimum number of columns. This does not validate the column names, as there can be any number of additional rows in - the header. An empty record will raise the same error. + the header; only the first row without a tag is validated. + An empty first record will raise the same error. """ num_cols = len(self._specs["columns"]) cols = []