From cbf6da4e78b22bec8a1d14b77adde137e2ae2b66 Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Mon, 20 May 2024 15:35:29 -0400 Subject: [PATCH] crash fixes --- src/Makefile | 7 +++++++ src/Snakefiles/8-hicPipeline.sm | 2 +- src/scripts/cluster.py | 11 +++++++++-- 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/src/Makefile b/src/Makefile index 9be46d85..70008ba2 100644 --- a/src/Makefile +++ b/src/Makefile @@ -814,6 +814,9 @@ all: $(addprefix ${TARGET_DIR}/,${ALL_TGTS}) \ ../lib/verkko/scripts/graph_functions.py \ ../lib/verkko/scripts/rdna_scaff_functions.py \ ../lib/verkko/scripts/rdna_scaff.py \ + ../lib/verkko/scripts/scaffolding/logger_wrap.py \ + ../lib/verkko/scripts/scaffolding/match_graph.py \ + ../lib/verkko/scripts/scaffolding/scaffold_graph.py \ \ ../lib/verkko/Snakefile \ ../lib/verkko/Snakefiles/1-buildGraph.sm \ @@ -900,6 +903,10 @@ $(foreach TGT,${ALL_TGTS},\ @mkdir -p ../lib/verkko/data cp -pf $< $@ +../lib/verkko/scripts/scaffolding/%: scripts/scaffolding/% + @mkdir -p ../lib/verkko/scripts/scaffolding + cp -pf $< $@ + ../lib/verkko/scripts/%: scripts/% @mkdir -p ../lib/verkko/scripts cp -pf $< $@ diff --git a/src/Snakefiles/8-hicPipeline.sm b/src/Snakefiles/8-hicPipeline.sm index c61fe1ff..a3212331 100644 --- a/src/Snakefiles/8-hicPipeline.sm +++ b/src/Snakefiles/8-hicPipeline.sm @@ -369,7 +369,7 @@ rule transformBWA: input: bwa_mapping = '8-hicPipeline/hic_to_assembly.sorted_by_read.bam' output: - byread_mapping = '8-hicPipeline/hic_mapping.byread.output' + byread_mapping = '8-hicPipeline/hic_mapping.byread.output', byread_mapping_all = '8-hicPipeline/hic_mapping_all.byread.output' log: diff --git a/src/scripts/cluster.py b/src/scripts/cluster.py index dc3f5e49..791d0ba9 100755 --- a/src/scripts/cluster.py +++ b/src/scripts/cluster.py @@ -6,6 +6,7 @@ import graph_functions import copy from networkx.algorithms import community +from scaffolding import match_graph, logger_wrap def check_non_empty(part, G): for p in part: @@ -180,6 +181,8 @@ def run_clustering (graph_gfa, mashmap_sim, hic_byread, output_dir, no_rdna, une MAX_RDNA_COMPONENT = 10000000 # maximal size of rDNA component, used for filtering out rDNA cluster only MIN_RDNA_COMPONENT = 500000 + logger = logger_wrap.initLogger("phasing.log") + # load the assembly gfa G = nx.Graph() logging_f = open (os.path.join(output_dir, LOGGING_FILENAME), 'w') @@ -287,7 +290,11 @@ def run_clustering (graph_gfa, mashmap_sim, hic_byread, output_dir, no_rdna, une #Adding link between matched edges to include separated sequence to main component - matchGraph = graph_functions.loadMatchGraph(mashmap_sim, G, -10*FIXED_WEIGHT, CLEAR_HOMOLOGY, MIN_ALIGNMENT) + #TODO: only one of those should be used + mg = match_graph.MatchGraph(mashmap_sim, G, -10*FIXED_WEIGHT, CLEAR_HOMOLOGY, MIN_ALIGNMENT, logger) + matchGraph = mg.getMatchGraph() + +# matchGraph = graph_functions.loadMatchGraph(mashmap_sim, G, -10*FIXED_WEIGHT, CLEAR_HOMOLOGY, MIN_ALIGNMENT) component_colors = graph_functions.getComponentColors(G) #reconnecting homologous nodes @@ -314,7 +321,7 @@ def run_clustering (graph_gfa, mashmap_sim, hic_byread, output_dir, no_rdna, une total_l = 0 for n in current_component: total_l += G.nodes[n]['length'] - if total_l > 1000000 and not matchGraph.isDiploid(current_component): + if total_l > 1000000 and not mg.isDiploid(current_component): logging_f.write(f"Component is not diploid!\n")