diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index 4b56aca9..5da28062 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -22,10 +22,18 @@ jobs: with: path: main + - name: get lineage databases + uses: actions/checkout@v3 + with: + repository: bacpop/beebop_data + path: ./main/storage + - name: Download and extract GPS database working-directory: ./main run: | ./scripts/download_db --small storage + + - name: Set up Python 3.9 uses: actions/setup-python@v2 @@ -44,31 +52,37 @@ jobs: source $CONDA/etc/profile.d/conda.sh conda activate beebop_py -# - name: Install poppunk -# run: conda install poppunk + - name: Install poppunk + run: conda install poppunk # currently the latest poppunk release is missing some functions required for beebop. For now installing from source: - - name: Get poppunk source code - uses: actions/checkout@v3 - with: - repository: bacpop/PopPUNK - ref: fix-json-serialisation - path: poppunk - - - name: install poppunk & dependencies - working-directory: ./poppunk + # - name: Get poppunk source code + # uses: actions/checkout@v3 + # with: + # repository: bacpop/PopPUNK + # path: poppunk + + # - name: install poppunk & dependencies + # working-directory: ./poppunk + # run: | + # source $CONDA/etc/profile.d/conda.sh + # conda activate beebop_py + # conda install graph-tool + # conda install mandrake + # conda install rapidnj + # sudo apt-get install libeigen3-dev + # sudo apt-get install libopenblas-dev + # sudo apt-get install -y '^libxcb.*-dev' libx11-xcb-dev libglu1-mesa-dev libxrender-dev libxi-dev libxkbcommon-dev libxkbcommon-x11-dev + # pip install joblib==1.1.0 + # conda install -c bioconda pp-sketchlib=2.0.0 + # pip3 install git+https://github.com/bacpop/PopPUNK + + - name: install dependencies run: | source $CONDA/etc/profile.d/conda.sh conda activate beebop_py - conda install graph-tool - conda install mandrake - conda install rapidnj - sudo apt-get install libeigen3-dev - sudo apt-get install libopenblas-dev + sudo apt-get install libeigen3-dev libopenblas-dev sudo apt-get install -y '^libxcb.*-dev' libx11-xcb-dev libglu1-mesa-dev libxrender-dev libxi-dev libxkbcommon-dev libxkbcommon-x11-dev - pip install joblib==1.1.0 - conda install -c bioconda pp-sketchlib=2.0.0 - pip3 install git+https://github.com/bacpop/PopPUNK - name: Install Poetry uses: snok/install-poetry@v1 @@ -117,7 +131,7 @@ jobs: # Uncomment the next three lines to debug on failure with # tmate. However, don't leave them uncommented on merge as that # causes failing builds to hang forever. 
- # - # - name: tmate session - # if: ${{ failure() }} - # uses: mxschmitt/action-tmate@v3 + + - name: tmate session + if: ${{ failure() }} + uses: mxschmitt/action-tmate@v3 diff --git a/.gitignore b/.gitignore index 791ee4ee..25eb6072 100644 --- a/.gitignore +++ b/.gitignore @@ -9,4 +9,5 @@ ignored tests/results storage/poppunk_output node_modules -storage/GPS_v4* \ No newline at end of file +storage/GPS_v4* +storage/strain_*_lineage_db \ No newline at end of file diff --git a/beebop/app.py b/beebop/app.py index 94d1da3f..bb1de864 100644 --- a/beebop/app.py +++ b/beebop/app.py @@ -11,8 +11,9 @@ import json import requests import pickle +import csv -from beebop import versions, assignClusters, visualise +from beebop import versions, assignClusters, assignLineages, visualise from beebop.filestore import PoppunkFileStore, DatabaseFileStore from beebop.utils import get_args import beebop.schemas @@ -65,11 +66,15 @@ def check_connection(redis) -> None: abort(500, description="Redis not found") -def generate_zip(path_folder: str, type: str, cluster: str) -> BytesIO: +def generate_zip(fs: PoppunkFileStore, + p_hash: str, + type: str, + cluster: str) -> BytesIO: """ [This generates a .zip folder with results data.] - :param path_folder: [path to folder to be zipped] + :param fs: [PoppunkFileStore with path to folder to be zipped] + :param p_hash: [project hash] :param type: [can be either 'microreact' or 'network'] :param cluster: [only relevant for 'network', since there are multiple component files stored in the folder, but only the right one should @@ -79,9 +84,14 @@ def generate_zip(path_folder: str, type: str, cluster: str) -> BytesIO: """ memory_file = BytesIO() if type == 'microreact': + path_folder = fs.output_microreact(p_hash, cluster) add_files(memory_file, path_folder) elif type == 'network': - file_list = (f'network_component_{cluster}.graphml', + path_folder = fs.output_network(p_hash) + with open(fs.network_mapping(p_hash), 'rb') as dict: + cluster_component_mapping = pickle.load(dict) + component = cluster_component_mapping[str(cluster)] + file_list = (f'network_component_{component}.graphml', 'network_cytoscape.csv', 'network_cytoscape.graphml') add_files(memory_file, path_folder, file_list) @@ -199,15 +209,32 @@ def run_poppunk_internal(sketches: dict, # check connection to redis check_connection(redis) # submit list of hashes to redis worker - job_assign = q.enqueue(assignClusters.get_clusters, - hashes_list, - p_hash, - fs, - db_paths, - args, - job_timeout=job_timeout) + job_assign_clusters = q.enqueue(assignClusters.get_clusters, + hashes_list, + p_hash, + fs, + db_paths, + args, + job_timeout=job_timeout) # save p-hash with job.id in redis server - redis.hset("beebop:hash:job:assign", p_hash, job_assign.id) + redis.hset("beebop:hash:job:assignClusters", + p_hash, + job_assign_clusters.id) + + # submit list of hashes to redis worker + job_assign_lineages = q.enqueue(assignLineages.get_lineages, + hashes_list, + p_hash, + fs, + db_paths, + args, + depends_on=job_assign_clusters, + job_timeout=job_timeout) + # save p-hash with job.id in redis server + redis.hset("beebop:hash:job:assignLineages", + p_hash, + job_assign_lineages.id) + # create visualisations # microreact job_microreact = q.enqueue(visualise.microreact, @@ -216,7 +243,8 @@ def run_poppunk_internal(sketches: dict, db_paths, args, name_mapping), - depends_on=job_assign, job_timeout=job_timeout) + depends_on=job_assign_clusters, + job_timeout=job_timeout) redis.hset("beebop:hash:job:microreact", p_hash, 
               job_microreact.id)
    # network
@@ -226,9 +254,11 @@ def run_poppunk_internal(sketches: dict,
                              db_paths,
                              args,
                              name_mapping),
-                            depends_on=job_assign, job_timeout=job_timeout)
+                            depends_on=job_assign_clusters,
+                            job_timeout=job_timeout)
     redis.hset("beebop:hash:job:network", p_hash, job_network.id)
-    return jsonify(response_success({"assign": job_assign.id,
+    return jsonify(response_success({"assign_clusters": job_assign_clusters.id,
+                                     "assign_lineages": job_assign_lineages.id,
                                      "microreact": job_microreact.id,
                                      "network": job_network.id}))

@@ -262,16 +292,25 @@
     def get_status_job(job, p_hash, redis):
         id = redis.hget(f"beebop:hash:job:{job}", p_hash).decode("utf-8")
         return Job.fetch(id, connection=redis).get_status()
     try:
-        status_assign = get_status_job('assign', p_hash, redis)
-        if status_assign == "finished":
+        status_assign_clusters = get_status_job('assignClusters',
+                                                p_hash,
+                                                redis)
+        if status_assign_clusters == "finished":
+            status_assign_lineages = get_status_job('assignLineages',
+                                                    p_hash,
+                                                    redis)
             status_microreact = get_status_job('microreact', p_hash, redis)
             status_network = get_status_job('network', p_hash, redis)
         else:
+            status_assign_lineages = "waiting"
             status_microreact = "waiting"
             status_network = "waiting"
-        return jsonify(response_success({"assign": status_assign,
-                                         "microreact": status_microreact,
-                                         "network": status_network}))
+        return jsonify(response_success({
+            "assignClusters": status_assign_clusters,
+            "assignLineages": status_assign_lineages,
+            "microreact": status_microreact,
+            "network": status_network
+        }))
     except AttributeError:
         return jsonify(error=response_failure({
             "error": "Unknown project hash"})), 500

@@ -290,7 +329,8 @@ def get_results(result_type) -> json:
         must be provided by the user in the frontend]

     :param result_type: [can be
-        - 'assign' for clusters
+        - 'assignClusters' for clusters
+        - 'assignLineages' for lineages
         - 'zip' for visualisation results as zip folders (with the json
             property 'type' specifying whether 'microreact' or 'network'
             results are required)
@@ -299,9 +339,12 @@ def get_results(result_type) -> json:
         cluster]
     :return json: [response object with result stored in 'data']
     """
-    if result_type == 'assign':
+    if result_type == 'assignClusters':
         p_hash = request.json['projectHash']
         return get_clusters_internal(p_hash, redis)
+    elif result_type == 'assignLineages':
+        p_hash = request.json['projectHash']
+        return get_lineages_internal(p_hash, storage_location)
     elif result_type == 'zip':
         p_hash = request.json['projectHash']
         visualisation_type = request.json['type']

@@ -338,7 +381,8 @@ def get_clusters_internal(p_hash: str, redis: Redis) -> json:
     """
     check_connection(redis)
     try:
-        id = redis.hget("beebop:hash:job:assign", p_hash).decode("utf-8")
+        id = redis.hget("beebop:hash:job:assignClusters",
+                        p_hash).decode("utf-8")
         job = Job.fetch(id, connection=redis)
         if job.result is None:
             return jsonify(error=response_failure({
@@ -350,6 +394,36 @@
             "error": "Unknown project hash"})), 500


+def get_lineages_internal(p_hash: str, storage_location: str) -> json:
+    """
+    [returns lineage assignment results]
+
+    :param p_hash: [project hash]
+    :param storage_location: [storage location]
+    :return json: [response object with lineage results stored in 'data']
+    """
+    fs = PoppunkFileStore(storage_location)
+    path_lineages = fs.lineages_csv(p_hash)
+    lineages = {}
+
+    try:
+        with open(path_lineages) as csv_file:
+            csv_reader = csv.reader(csv_file, delimiter=',')
+            next(csv_reader)  # skip the header row (id,Cluster,Rank_1,...)
+            for row in csv_reader:
+                lineages[row[0]] = {
+                    "rank1": row[2],
+                    "rank2": row[3],
+                    "rank3": row[4]}
+        result = jsonify(response_success(lineages))
+
+    except FileNotFoundError:
+        result = jsonify(error=response_failure({
+            "error": "File not found",
+            "detail": "Lineage results not found"
+        })), 500
+    return result
+
+
 def send_zip_internal(p_hash: str,
                       type: str,
                       cluster: str,
@@ -364,12 +438,8 @@ def send_zip_internal(p_hash: str,
     :return any: [zipfile]
     """
     fs = PoppunkFileStore(storage_location)
-    if type == 'microreact':
-        path_folder = fs.output_microreact(p_hash, cluster)
-    elif type == 'network':
-        path_folder = fs.output_network(p_hash)
     # generate zipfile
-    memory_file = generate_zip(path_folder, type, cluster)
+    memory_file = generate_zip(fs, p_hash, type, cluster)
     return send_file(memory_file,
                      download_name=type + '.zip',
                      as_attachment=True)
diff --git a/beebop/assignClusters.py b/beebop/assignClusters.py
index e6711737..62313e58 100644
--- a/beebop/assignClusters.py
+++ b/beebop/assignClusters.py
@@ -1,25 +1,10 @@
 from PopPUNK.web import summarise_clusters, sketch_to_hdf5
 from PopPUNK.utils import setupDBFuncs
-import re
 import os

 from beebop.poppunkWrapper import PoppunkWrapper
 from beebop.filestore import PoppunkFileStore, DatabaseFileStore
-
-
-def hex_to_decimal(sketches_dict) -> None:
-    """
-    [Converts all hexadecimal numbers in the sketches into decimal numbers.
-    These have been stored in hexadecimal format to not loose precision when
-    sending the sketches from the backend to the frontend]
-
-    :param sketches_dict: [dictionary holding all sketches]
-    """
-    for sample in list(sketches_dict.values()):
-        if type(sample['14'][0]) == str and re.match('0x.*', sample['14'][0]):
-            for x in range(14, 30, 3):
-                sample[str(x)] = list(map(lambda x: int(x, 16),
-                                          sample[str(x)]))
+from beebop.utils import hex_to_decimal


 def get_clusters(hashes_list: list,
@@ -63,15 +48,19 @@
     # run query assignment
     wrapper = PoppunkWrapper(fs, db_paths, args, p_hash)
-    wrapper.assign_clusters(dbFuncs, qc_dict, qNames)
+    isolateClustering = wrapper.assign_clusters(dbFuncs, qc_dict, qNames)
+
+    # Process clustering
+    query_strains = {}
+    clustering_type = 'combined'
+    for isolate in isolateClustering[clustering_type]:
+        if isolate in qNames:
+            strain = isolateClustering[clustering_type][isolate]
+            if strain in query_strains:
+                query_strains[strain].append(isolate)
+            else:
+                query_strains[strain] = [isolate]

-    queries_names, queries_clusters, _, _, _, _, _ = \
-        summarise_clusters(outdir, args.assign.species, db_paths.db, qNames)
+    summarise_clusters(outdir, args.assign.species, db_paths.db, qNames)

-    result = {}
-    for i, (name, cluster) in enumerate(zip(queries_names, queries_clusters)):
-        result[i] = {
-            "hash": name,
-            "cluster": cluster
-        }
-    return result
+    return query_strains
diff --git a/beebop/assignLineages.py b/beebop/assignLineages.py
new file mode 100644
index 00000000..b0d7cb19
--- /dev/null
+++ b/beebop/assignLineages.py
@@ -0,0 +1,115 @@
+
+import pickle
+from pathlib import PurePath
+import shutil
+from rq import get_current_job
+from redis import Redis
+import os
+from PopPUNK.web import sketch_to_hdf5
+from PopPUNK.utils import setupDBFuncs, createOverallLineage
+from PopPUNK.lineages import print_overall_clustering
+
+from beebop.poppunkWrapper import PoppunkWrapper
+from beebop.filestore import PoppunkFileStore, DatabaseFileStore
+from beebop.utils import hex_to_decimal
+
+
+def get_lineages(hashes_list: list,
+                 p_hash: str,
+                 fs: PoppunkFileStore,
+                 db_paths: DatabaseFileStore,
+                 args: dict) -> dict:
+    """
+    Assign lineages to the query samples using PopPUNK's lineage models.
+
+    :param hashes_list: [list of file hashes from all query samples]
+    :param p_hash: [project_hash]
+    :param fs: [PoppunkFileStore with paths to input files]
+    :param db_paths: [DatabaseFileStore which provides paths
+        to database files]
+    :param args: [arguments for Poppunk's assign function, stored in
+        resources/args.json]
+    :return dict: [dict with strain (key) and lineage assignments (value)]
+    """
+    try:
+        # get results from previous job
+        current_job = get_current_job(Redis())
+        query_strains = current_job.dependency.result
+
+        # set output directory
+        outdir = fs.output(p_hash)
+        if not os.path.exists(outdir):
+            os.mkdir(outdir)
+
+        # create qc_dict
+        qc_dict = {'run_qc': False}
+
+        # create dbFuncs
+        dbFuncs = setupDBFuncs(args=args.assign)
+
+        # transform json to dict
+        sketches_dict = {}
+        for hash in hashes_list:
+            sketches_dict[hash] = fs.input.get(hash)
+
+        # convert hex to decimal
+        hex_to_decimal(sketches_dict)
+
+        # create hdf5 db
+        qNames = sketch_to_hdf5(sketches_dict, outdir)
+
+        # generate wrapper
+        wrapper = PoppunkWrapper(fs, db_paths, args, p_hash)
+
+        # Read querying scheme
+        with open(db_paths.lineage_scheme, 'rb') as pickle_file:
+            ref_db, rlist, model_dir, clustering_file, \
+                args.clustering_col_name, distances, kmers, sketch_sizes, \
+                codon_phased, max_search_depth, rank_list, use_accessory, \
+                min_count, count_unique_distances, reciprocal_only, \
+                strand_preserved, core, accessory, lineage_dbs = \
+                pickle.load(pickle_file)
+
+        # add symlink: PopPUNK's function assign_query_hdf5() takes a path as
+        # 'output' argument, which is where it will look for a .h5 file, but
+        # also writes .dists.pkl and .dists.npy to this location. When running
+        # this function for lineage models, we need to point it to the .h5
+        # file in /poppunk_output, but using this path as 'output' results in
+        # the .dists files from cluster assignment being overwritten by those
+        # from lineage assignment. Since these are still needed for the
+        # visualisations, writing .dists files to another location is
+        # necessary. This is done by setting 'output' to a subfolder of
+        # /poppunk_output, and to find the .h5 file this symlink is generated.
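+        # As an illustration (hypothetical project hash 'abc'), the lines
+        # below create
+        #   link:   <cwd>/<project output>/lineages_output/lineages_output.h5
+        #   target: <cwd>/<project output>/abc.h5
+        # so assign_query_hdf5() finds the sketch database through the link
+        # while its .dists files land in the lineages_output subfolder.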
+        currentwd = os.getcwd()
+        target = str(PurePath(currentwd, fs.output(p_hash), f"{p_hash}.h5"))
+        link = str(PurePath(currentwd,
+                            fs.lineages_output(p_hash),
+                            "lineages_output.h5"))
+        os.makedirs(fs.lineages_output(p_hash), exist_ok=True)
+        os.symlink(target, link)
+
+        overall_lineage = {}
+        for strain in query_strains:
+            if strain in lineage_dbs.keys():
+                lineageClustering = wrapper.assign_lineages(dbFuncs,
+                                                            qc_dict,
+                                                            lineage_dbs,
+                                                            strain,
+                                                            query_strains,
+                                                            strand_preserved,
+                                                            core,
+                                                            accessory)
+                overall_lineage[strain] = \
+                    createOverallLineage(rank_list, lineageClustering)
+
+        # Print combined strain and lineage clustering
+        print_overall_clustering(overall_lineage,
+                                 fs.lineages_output(p_hash) + '.csv',
+                                 qNames)
+
+    finally:
+        # remove output folder - otherwise rerunning the code with the same
+        # project hash will cause an error when generating the symlink
+        shutil.rmtree(fs.lineages_output(p_hash))
+
+    return overall_lineage
diff --git a/beebop/filestore.py b/beebop/filestore.py
index e7dda7cb..d7cb2031 100644
--- a/beebop/filestore.py
+++ b/beebop/filestore.py
@@ -1,6 +1,6 @@
 import os
 import json
-from pathlib import PurePath
+from pathlib import PurePath, Path


 class FileStore:
@@ -138,6 +138,14 @@ def network_output_csv(self, p_hash) -> str:
                            "network",
                            "network_cytoscape.csv"))

+    def lineages_csv(self, p_hash) -> str:
+        """
+        :param p_hash: [project hash]
+        :return str: [path to lineages csv file]
+        """
+        return str(PurePath(self.output(p_hash),
+                            "lineages_output.csv"))
+
     def network_output_component(self, p_hash, component_number) -> str:
         """
         :param p_hash: [project hash]
@@ -158,6 +166,13 @@ def network_mapping(self, p_hash) -> str:
                            "network",
                            'cluster_component_dict.pickle'))

+    def lineages_output(self, p_hash) -> str:
+        """
+        :param p_hash: [project_hash]
+        :return str: [path to lineage results]
+        """
+        return str(PurePath(self.output(p_hash), "lineages_output"))
+

 class DatabaseFileStore:
     """
@@ -174,3 +189,21 @@ def __init__(self, full_path):
                                          self.name).with_suffix('.dists.pkl'))
         self.previous_clustering = str(PurePath(self.db,
                                                 f"{self.name}_clusters.csv"))
+
+        self.lineage_scheme = PurePath(self.path, 'example_lineage_scheme.pkl')
+
+    def lineage_model(self, lineage_path: str) -> str:
+        """
+        :param lineage_path: [path to lineage database folder]
+        :return str: [path to lineage model directory]
+        """
+        return './' + str(PurePath(self.path, lineage_path))
+
+    def lineage_distances(self, lineage_path: str) -> str:
+        """
+        :param lineage_path: [path to lineage database folder]
+        :return str: [path to distances file]
+        """
+        return str(PurePath(self.path,
+                            lineage_path,
+                            f"{Path(lineage_path).name}.dists"))
diff --git a/beebop/poppunkWrapper.py b/beebop/poppunkWrapper.py
index e516e2bd..db0282bb 100644
--- a/beebop/poppunkWrapper.py
+++ b/beebop/poppunkWrapper.py
@@ -1,7 +1,9 @@
 from PopPUNK.assign import assign_query_hdf5
+from PopPUNK.lineages import query_db
 from PopPUNK.visualise import generate_visualisations
 from beebop.filestore import DatabaseFileStore
 import shutil
+import os


 class PoppunkWrapper:
@@ -25,14 +27,16 @@ def __init__(self, fs, db_paths, args, p_hash):
     def assign_clusters(self,
                         dbFuncs: DatabaseFileStore,
                         qc_dict: dict,
-                        qNames: list) -> None:
+                        qNames: list) -> dict:
         """
         :param dbFuncs: [database functions, generated with poppunks
             setupDBFuncs()]
         :param qc_dict: [dict whether qc should run or not]
         :param qNames: [hd5 database with all sketches]
+        :return dict: dict of dict with cluster assignments (keys are sequence
+            names)
         """
-        assign_query_hdf5(
+        isolateClustering = assign_query_hdf5(
             dbFuncs=dbFuncs,
             ref_db=self.db_paths.db,
             qNames=qNames,
@@ -56,6 +60,60 @@
             gpu_graph=self.args.assign.gpu_graph,
             save_partial_query_graph=self.args.assign.save_partial_query_graph
         )
+        return isolateClustering
+
+    def assign_lineages(self,
+                        dbFuncs: DatabaseFileStore,
+                        qc_dict: dict,
+                        lineage_dbs: dict,
+                        strain: str,
+                        query_strains: dict,
+                        strand_preserved: bool,
+                        core: bool,
+                        accessory: bool) -> dict:
+        """
+        :param dbFuncs: [database functions, generated with poppunks
+            setupDBFuncs()]
+        :param qc_dict: [dict whether qc should run or not]
+        :param lineage_dbs: [dict with names of lineage db folders for all
+            clusters]
+        :param strain: [string specifying the current cluster number]
+        :param query_strains: [dict listing all query sample names in current
+            cluster]
+        :param strand_preserved: [bool, giving information about sequencing of
+            the isolate]
+        :param core: [bool whether only core distances should be used]
+        :param accessory: [bool whether only accessory distances should be
+            used]
+        :return dict: dict of dict with lineage assignments (keys are sequence
+            names)
+        """
+        model_path = self.db_paths.lineage_model(lineage_dbs[strain])
+        lineage_clustering = assign_query_hdf5(
+            dbFuncs=dbFuncs,
+            ref_db=model_path,
+            qNames=query_strains[strain],
+            output=self.fs.lineages_output(self.p_hash),
+            qc_dict=qc_dict,
+            update_db=self.args.assign.update_db,
+            write_references=self.args.assign.write_references,
+            distances=self.db_paths.lineage_distances(lineage_dbs[strain]),
+            serial=self.args.assign.serial,
+            threads=self.args.assign.threads,
+            overwrite=self.args.assign.overwrite,
+            plot_fit=self.args.assign.plot_fit,
+            graph_weights=self.args.assign.graph_weights,
+            model_dir=model_path,
+            strand_preserved=strand_preserved,
+            previous_clustering=model_path,
+            external_clustering=self.args.assign.external_clustering,
+            core=core,
+            accessory=accessory,
+            gpu_dist=self.args.assign.gpu_dist,
+            gpu_graph=self.args.assign.gpu_graph,
+            save_partial_query_graph=self.args.assign.save_partial_query_graph
+        )
+        return lineage_clustering

     def create_microreact(self, cluster: str) -> None:
         """
diff --git a/beebop/utils.py b/beebop/utils.py
index 0ae80db1..dccaf1cc 100644
--- a/beebop/utils.py
+++ b/beebop/utils.py
@@ -15,6 +15,21 @@
 ET.register_namespace('xsi', "http://www.w3.org/2001/XMLSchema-instance")


+def hex_to_decimal(sketches_dict) -> None:
+    """
+    [Converts all hexadecimal numbers in the sketches into decimal numbers.
+    These have been stored in hexadecimal format to not lose precision when
+    sending the sketches from the backend to the frontend]
+
+    :param sketches_dict: [dictionary holding all sketches]
+    """
+    for sample in list(sketches_dict.values()):
+        if type(sample['14'][0]) == str and re.match('0x.*', sample['14'][0]):
+            for x in range(14, 30, 3):
+                sample[str(x)] = list(map(lambda x: int(x, 16),
+                                          sample[str(x)]))
+
+
 def get_args() -> dict:
     """
     [Read in fixed arguments to poppunk that are always set, or used as
@@ -90,9 +105,9 @@ def delete_component_files(cluster_component_dict: dict,
     """
     queries_clusters = []
     queries_components = []
-    for item in assign_result.values():
-        queries_clusters.append(item['cluster'])
-        queries_components.append(cluster_component_dict[str(item['cluster'])])
+    for item in list(assign_result.keys()):
+        queries_clusters.append(item)
+        queries_components.append(cluster_component_dict[str(item)])
     components = set(queries_components)
     # delete redundant component files
     keep_filenames = list(map(lambda x: f"network_component_{x}.graphml",
diff --git a/beebop/visualise.py b/beebop/visualise.py
index 68e2e509..06845ee8 100644
--- a/beebop/visualise.py
+++ b/beebop/visualise.py
@@ -55,10 +55,8 @@ def microreact_internal(assign_result,
         corresponding filenames (values) of all query samples.]
     """
     wrapper = PoppunkWrapper(fs, db_paths, args, p_hash)
-    queries_clusters = []
-    for item in assign_result.values():
-        queries_clusters.append(item['cluster'])
-    for cluster_no in set(queries_clusters):
+    queries_clusters = list(assign_result.keys())
+    for cluster_no in queries_clusters:
         wrapper.create_microreact(cluster_no)
         replace_filehashes(fs.output_microreact(p_hash, cluster_no),
                            name_mapping)
diff --git a/spec/cluster.schema.json b/spec/cluster.schema.json
index e81f538a..43ea457b 100644
--- a/spec/cluster.schema.json
+++ b/spec/cluster.schema.json
@@ -3,14 +3,9 @@
     "type": "object",
     "patternProperties": {
         "[0-9]*": {
-            "type": "object",
-            "properties": {
-                "hash": {
-                    "type": "string"
-                },
-                "cluster": {
-                    "type": ["integer", "string"]
-                }
+            "type": "array",
+            "items": {
+                "type": "string"
             }
         }
     }
diff --git a/tests/files/poppunk_output/test_lineage_csv/lineages_output.csv b/tests/files/poppunk_output/test_lineage_csv/lineages_output.csv
new file mode 100644
index 00000000..7d96efc9
--- /dev/null
+++ b/tests/files/poppunk_output/test_lineage_csv/lineages_output.csv
@@ -0,0 +1,3 @@
+id,Cluster,Rank_1,Rank_2,Rank_3,overall
+7622_5_91,5,20,3,2,20-3-2
+6930_8_9,59,5,2,2,5-2-2
diff --git a/tests/files/poppunk_output/test_network_zip/network/cluster_component_dict.pickle b/tests/files/poppunk_output/test_network_zip/network/cluster_component_dict.pickle
new file mode 100644
index 00000000..8d60c3de
Binary files /dev/null and b/tests/files/poppunk_output/test_network_zip/network/cluster_component_dict.pickle differ
diff --git a/tests/files/poppunk_output/test_network_zip/network/network_component_38.graphml b/tests/files/poppunk_output/test_network_zip/network/network_component_38.graphml
new file mode 100644
index 00000000..a5441b83
--- /dev/null
+++ b/tests/files/poppunk_output/test_network_zip/network/network_component_38.graphml
@@ -0,0 +1,529 @@
[graphml fixture not reproduced: the XML markup was lost in extraction. The 529 added lines define a network graph with 35 'ref' nodes (10754X48_140620_D00294_0106_AC4FB4ANXX_5, 11511_7#13, ..., 7553_4#6), two 'query' nodes (7622_5#73.fa, 7553_4#6.fa), and the component's edge list.]
\ No newline at end of file
diff --git a/tests/test_integration.py b/tests/test_integration.py
index 63ec9fb4..e6866bb3 100644
--- a/tests/test_integration.py
+++ b/tests/test_integration.py
@@ -41,17 +41,18 @@ def test_run_poppunk(client, qtbot):
     # retrieve job status
     status = client.get("/status/" + p_hash)
     status_options = ['queued', 'started', 'finished', 'waiting']
-    assert read_data(status)['assign'] in status_options
+    assert read_data(status)['assignClusters'] in status_options
+    assert read_data(status)['assignLineages'] in status_options
     assert read_data(status)['microreact'] in status_options
     assert read_data(status)['network'] in status_options
     # retrieve cluster result when finished

-    def assign_status_finished():
+    def assignClusters_status_finished():
         status = client.get("/status/" + p_hash)
-        assert read_data(status)['assign'] == 'finished'
+        assert read_data(status)['assignClusters'] == 'finished'

-    qtbot.waitUntil(assign_status_finished, timeout=20000)
-    result = client.post("/results/assign", json={
+    qtbot.waitUntil(assignClusters_status_finished, timeout=200000)
+    result = client.post("/results/assignClusters", json={
         'projectHash': p_hash})
     result_object = json.loads(result.data.decode("utf-8"))
     assert result_object["status"] == "success"
@@ -77,6 +78,15 @@ def network_status_finished():
                        "/network/cluster_component_dict.pickle")


+def test_results_lineage(client):
+    p_hash = 'test_lineage_csv'
+    response = client.post("/results/assignLineages", json={
+        'projectHash': p_hash,
+        'cluster': 1})
+    data = json.loads(response.data.decode('utf-8'))['data']
+    assert {"rank1": '20', "rank2": '3', "rank3": '2'} == data['7622_5_91']
+
+
 def test_results_microreact(client):
     p_hash = 'test_microreact_api'
     cluster = 7
@@ -102,8 +112,9 @@ def test_results_zip(client):
     type = 'network'
     response = client.post("/results/zip", json={
         'projectHash': p_hash,
-        'cluster': None,
+        'cluster': 1,
         'type': type})
+    assert 'network_component_38.graphml'.encode('utf-8') in response.data
     assert 'network_cytoscape.csv'.encode('utf-8') in response.data
     assert 'network_cytoscape.graphml'.encode('utf-8') in response.data
diff --git a/tests/test_unit.py b/tests/test_unit.py
index bbf7dd11..0d8a8ca8 100644
--- a/tests/test_unit.py
+++ b/tests/test_unit.py
@@ -13,6 +13,7 @@
 import os
 from flask import Flask
 from unittest.mock import Mock, patch
+from unittest import TestCase
 from io import BytesIO
 import xml.etree.ElementTree as ET

@@ -20,6 +21,7 @@
 from beebop import app
 from beebop import versions
 from beebop import assignClusters
+from beebop import assignLineages
 from beebop import visualise
 from beebop import utils
 from beebop.filestore import PoppunkFileStore, FileStore, DatabaseFileStore
@@ -68,16 +70,60 @@ def test_assign_clusters():
         db_paths,
         args)
     expected = {
-        0: {'cluster': 9, 'hash': '02ff334f17f17d775b9ecd69046ed296'},
-        1: {'cluster': 41, 'hash': '9c00583e2f24fed5e3c6baa87a4bfa4c'},
-        2: {'cluster': 10, 'hash': '99965c83b1839b25c3c27bd2910da00a'}}
-    assert list(result.values()) == unordered(list(expected.values()))
+        '9': ['02ff334f17f17d775b9ecd69046ed296'],
+        '41': ['9c00583e2f24fed5e3c6baa87a4bfa4c'],
+        '10': ['99965c83b1839b25c3c27bd2910da00a']}
+    assert result == expected
+
+
+def test_assign_lineages(mocker):
+    def mock_get_current_job(Redis):
+        assign_result = {'9': ['02ff334f17f17d775b9ecd69046ed296'],
+                         '41': ['9c00583e2f24fed5e3c6baa87a4bfa4c'],
+                         '10': ['99965c83b1839b25c3c27bd2910da00a']}
+
+        class mock_dependency:
+            def __init__(self, result):
+                self.result = result
+
+        class mock_job:
+            def __init__(self, result):
+                self.dependency = mock_dependency(result)
+        return mock_job(assign_result)
+    mocker.patch(
+        'beebop.assignLineages.get_current_job',
+        new=mock_get_current_job
+    )
+    hashes_list = [
+        '02ff334f17f17d775b9ecd69046ed296',
+        '9c00583e2f24fed5e3c6baa87a4bfa4c',
+        '99965c83b1839b25c3c27bd2910da00a']
+
+    result = assignLineages.get_lineages(
+        hashes_list,
+        'unit_test_poppunk_assign',
+        fs,
+        db_paths,
+        args)
+    assert result['9']['Rank_1']['02ff334f17f17d775b9ecd69046ed296'] == 13
+    assert result['41']['Rank_1']['9c00583e2f24fed5e3c6baa87a4bfa4c'] == 2
+    assert result['10']['Rank_1']['99965c83b1839b25c3c27bd2910da00a'] == 39
+    assert result['9']['Rank_2']['02ff334f17f17d775b9ecd69046ed296'] == 1
+    assert result['41']['Rank_2']['9c00583e2f24fed5e3c6baa87a4bfa4c'] == 1
+    assert result['10']['Rank_2']['99965c83b1839b25c3c27bd2910da00a'] == 4
+    assert result['9']['Rank_3']['02ff334f17f17d775b9ecd69046ed296'] == 1
+    assert result['41']['Rank_3']['9c00583e2f24fed5e3c6baa87a4bfa4c'] == 1
+    assert result['10']['Rank_3']['99965c83b1839b25c3c27bd2910da00a'] == 2


 def test_microreact(mocker):
     def mock_get_current_job(Redis):
-        assign_result = {0: {'cluster': 5, 'hash': 'some_hash'},
-                         1: {'cluster': 59, 'hash': 'another_hash'}}
+        assign_result = {5: ['some_hash'],
+                         59: ['another_hash']}

         class mock_dependency:
             def __init__(self, result):
@@ -102,8 +148,8 @@ def __init__(self, result):


 def test_microreact_internal():
-    assign_result = {0: {'cluster': 5, 'hash': 'some_hash'},
-                     1: {'cluster': 59, 'hash': 'another_hash'}}
+    assign_result = {5: ['some_hash'],
+                     59: ['another_hash']}
     p_hash = 'unit_test_visualisations'
     name_mapping = {
         "hash1": "name1.fa",
@@ -117,8 +163,8 @@
 def test_network(mocker):
     def mock_get_current_job(Redis):
-        assign_result = {0: {'cluster': 5, 'hash': 'some_hash'},
-                         1: {'cluster': 59, 'hash': 'another_hash'}}
+        assign_result = {5: ['some_hash'],
+                         59: ['another_hash']}

         class mock_dependency:
             def __init__(self, result):
@@ -144,8 +190,8 @@ def __init__(self, result):


 def
test_network_internal(): - assign_result = {0: {'cluster': 5, 'hash': 'some_hash'}, - 1: {'cluster': 59, 'hash': 'another_hash'}} + assign_result = {5: ['some_hash'], + 59: ['another_hash']} p_hash = 'unit_test_visualisations' name_mapping = { "hash1": "name1.fa", @@ -190,19 +236,25 @@ def test_run_poppunk_internal(qtbot): # submits assign job to queue worker = SimpleWorker([queue], connection=queue.connection) worker.work(burst=True) # Runs enqueued job - job_assign = Job.fetch(job_ids["assign"], connection=redis) + job_assign_clusters = Job.fetch(job_ids["assign_clusters"], + connection=redis) status_options = ['queued', 'started', 'finished', 'scheduled'] - assert job_assign.get_status() in status_options + assert job_assign_clusters.get_status() in status_options # saves p-hash with job id in redis - assert read_redis("beebop:hash:job:assign", - project_hash, redis) == job_ids["assign"] + assert read_redis("beebop:hash:job:assignClusters", + project_hash, redis) == job_ids["assign_clusters"] # wait for assign job to be finished - def assign_status_finished(): - job = Job.fetch(job_ids["assign"], connection=redis) + def assign_clusters_status_finished(): + job = Job.fetch(job_ids["assign_clusters"], connection=redis) assert job.get_status() == 'finished' - qtbot.waitUntil(assign_status_finished, timeout=20000) - # submits visualisation jobs to queue + qtbot.waitUntil(assign_clusters_status_finished, timeout=200000) + # submits lineage assignment and visualisation jobs to queue + job_lineage = Job.fetch(job_ids["assign_lineages"], connection=redis) + assert job_lineage.get_status() in status_options + assert read_redis("beebop:hash:job:assignLineages", + project_hash, + redis) == job_ids["assign_lineages"] job_microreact = Job.fetch(job_ids["microreact"], connection=redis) assert job_microreact.get_status() in status_options assert read_redis("beebop:hash:job:microreact", @@ -220,7 +272,7 @@ def test_get_clusters_internal(client): q = Queue(connection=redis) job = q.enqueue(dummy_fct, 5) hash = "unit_test_get_clusters_internal" - redis.hset("beebop:hash:job:assign", hash, job.id) + redis.hset("beebop:hash:job:assignClusters", hash, job.id) result1 = app.get_clusters_internal(hash, redis) assert read_data(result1[0])['error'] == { "status": "failure", @@ -234,6 +286,8 @@ def test_get_clusters_internal(client): while finished is False: time.sleep(1) result2 = app.get_clusters_internal(hash, redis) + if type(result2) is tuple: + continue if read_data(result2)['status'] == 'success': finished = True assert read_data(result2) == { @@ -253,19 +307,22 @@ def test_get_status_internal(client): # queue example job redis = Redis() q = Queue(connection=Redis()) - job_assign = q.enqueue(dummy_fct, 1) + job_assign_clusters = q.enqueue(dummy_fct, 1) + job_assign_lineages = q.enqueue(dummy_fct, 1) job_microreact = q.enqueue(dummy_fct, 1) job_network = q.enqueue(dummy_fct, 1) worker = SimpleWorker([q], connection=q.connection) worker.work(burst=True) hash = "unit_test_get_status_internal" - redis.hset("beebop:hash:job:assign", hash, job_assign.id) + redis.hset("beebop:hash:job:assignClusters", hash, job_assign_clusters.id) + redis.hset("beebop:hash:job:assignLineages", hash, job_assign_lineages.id) redis.hset("beebop:hash:job:microreact", hash, job_microreact.id) redis.hset("beebop:hash:job:network", hash, job_network.id) result = app.get_status_internal(hash, redis) assert read_data(result)['status'] == 'success' status_options = ['queued', 'started', 'finished', 'scheduled', 'waiting'] - assert 
read_data(result)['data']['assign'] in status_options + assert read_data(result)['data']['assignClusters'] in status_options + assert read_data(result)['data']['assignLineages'] in status_options assert read_data(result)['data']['microreact'] in status_options assert read_data(result)['data']['network'] in status_options assert read_data(app.get_status_internal("wrong-hash", @@ -386,7 +443,7 @@ def test_send_zip_internal(client): assert filename1.encode('utf-8') in response.data assert filename2.encode('utf-8') in response.data project_hash = 'test_network_zip' - cluster = None + cluster = 1 type = 'network' response = app.send_zip_internal(project_hash, type, @@ -395,6 +452,7 @@ def test_send_zip_internal(client): response.direct_passthrough = False assert 'network_cytoscape.csv'.encode('utf-8') in response.data assert 'network_cytoscape.graphml'.encode('utf-8') in response.data + assert 'network_component_38.graphml'.encode('utf-8') in response.data def test_download_graphml_internal(): @@ -416,6 +474,19 @@ def test_download_graphml_internal(): assert error2['error'] == 'File not found' +def test_get_lineages_internal(): + project_hash = 'test_lineage_csv' + response = app.get_lineages_internal(project_hash, + storage_location) + data = read_data(response)['data'] + assert {"rank1": '20', "rank2": '3', "rank3": '2'} == data['7622_5_91'] + hash_no_lineage_file = 'test microreact_api' + response_error2 = app.get_lineages_internal(hash_no_lineage_file, + storage_location) + error2 = read_data(response_error2[0])['error']['errors'][0] + assert error2['error'] == 'File not found' + + def test_hex_to_decimal(): dummy_sketch = { "sample1": { @@ -516,8 +587,8 @@ def test_delete_component_files(): '12': '3' } assign_result = { - 0: {'cluster': '4'}, - 1: {'cluster': '5'} + 4: ['hash'], + 5: ['hash2'] } p_hash = 'results_modifications' utils.delete_component_files(cluster_component_dict,
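Note: the new lineage endpoints introduced in this diff can be driven end to end much as the integration tests above do. The sketch below is illustrative only; the base URL and project hash are hypothetical, and it assumes the Flask app from beebop/app.py is served locally with a Redis queue worker running.

```python
import requests

BASE = "http://localhost:5000"   # hypothetical host/port
P_HASH = "example_project_hash"  # hypothetical project hash

# Poll job status: 'assignLineages' stays 'waiting' until the
# 'assignClusters' job it depends on has finished.
status = requests.get(f"{BASE}/status/{P_HASH}").json()["data"]
print(status["assignClusters"], status["assignLineages"])

if status["assignLineages"] == "finished":
    # Lineage results are parsed from lineages_output.csv and keyed by
    # sample id, e.g.
    # {"7622_5_91": {"rank1": "20", "rank2": "3", "rank3": "2"}}
    lineages = requests.post(f"{BASE}/results/assignLineages",
                             json={"projectHash": P_HASH}).json()["data"]
    print(lineages)
```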