
Commit f6da468
presumably completely runnable pipeline
Dmitry-Antipov committed May 31, 2024
1 parent ed4893b commit f6da468
Showing 2 changed files with 12 additions and 225 deletions.
219 changes: 1 addition & 218 deletions src/scripts/rdna_scaff.py
@@ -59,223 +59,6 @@
rukki_paths = sf.read_rukki_paths(old_rukki_tsv_file, G)
gf.load_indirect_graph(gfa_file, indirectG)
#sf.try_to_scaff(rukki_paths, telomere_locations_file, os.path.join(hicrun_dir, "hic_mapping.byread.output"), os.path.join(hicrun_dir, "unitigs.matches"), G, indirectG, uncompressed_nodes)
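#the scaffold graph is built from rukki paths, telomere locations, Hi-C read pairs and a non-HPC mashmap self-alignment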
-sg = scaffold_graph.ScaffoldGraph(rukki_paths, telomere_locations_file, os.path.join(hicrun_dir, "hic_mapping.byread_upd.output"), os.path.join(hicrun_dir, "mashmap_nonhpc50.out"), G, uncompressed_nodes, logger)
+sg = scaffold_graph.ScaffoldGraph(rukki_paths, telomere_locations_file, os.path.join(hicrun_dir, "hic_mapping.byread.output"), os.path.join(hicrun_dir, "mashmap_nonhpc50.out"), G, uncompressed_nodes, logger)
res = sg.generateScaffolds()

exit()

#read graph
#add_telomeric_nodes
#construct new graph (vertex = start/end of edge), ignoring overlaps
#dijkstra
#condition

#not expected to be run in main pipeline
if debug and os.path.exists(rdna_file):
    print("Somehow you are deep in verkko's debug, is it intentional?")
    rdna_nodes_ribotin = sf.get_rdna_ribotin(hicrun_dir)
    rdna_nodes_mashmap = sf.get_rdna_mashmap(hicrun_dir)
    print("In ribotin but not in mashmap")
    print(rdna_nodes_ribotin - rdna_nodes_mashmap)
    print("In mashmap but not in ribotin")
    print(rdna_nodes_mashmap - rdna_nodes_ribotin)

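#in normal runs rDNA-containing nodes are detected with mashmap only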
rdna_nodes = sf.get_rdna_mashmap(hicrun_dir)


G = nx.DiGraph()
#graph_f fails?
#TODO should use graph_functions!
gf.load_direct_graph(gfa_file, G)
#mid_length edge weights give doubled distances between node midpoints
dists = dict(nx.all_pairs_dijkstra_path_length(G, weight='mid_length'))
paths = sf.read_rukki_paths(rukki_tsv_file, G)

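#classify rukki paths into short arms (100 kb - 5 Mb) and candidate long arms (> 10 Mb) next to the rDNA; the final boolean presumably flags the arm type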
short_arm_paths_ids = sf.get_arm_paths_ids(telomere_locations_file, G, paths, rdna_nodes, 100000, 5000000, False)
experimental_long_arm_path_ids = sf.get_arm_paths_ids(telomere_locations_file, G, paths, rdna_nodes, 10000000, 999999999, True)
same_component_long_arm_path_ids = sf.get_same_component_paths(short_arm_paths_ids, G, paths, 10000000, 999999999)
for idd in same_component_long_arm_path_ids.keys():
    if idd not in experimental_long_arm_path_ids:
        experimental_long_arm_path_ids[idd] = same_component_long_arm_path_ids[idd]
        print(f"Component-based added path {idd}")
print(f"short arms PATH number {len(short_arm_paths_ids)}")
for idd in short_arm_paths_ids.keys():
    print(paths.getPathById(idd))

print(short_arm_paths_ids)
#parse translation_hap files

#getting long arms
#e.g.: haplotype2-0000054 chr22 45121307 51324926
#chr names are not available in the general case

#Should not be run in normal runs
if debug and os.path.exists(translation_hap1_file) and os.path.exists(translation_paths_file):
    print("Somehow you are deep in verkko's debug, is it intentional?")
    long_arm_paths = {}
    multiplicities = {}
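    #the five human acrocentric chromosomes; their short arms carry the rDNA arrays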
    acros = {"chr13", "chr14", "chr15", "chr21", "chr22"}
    long_arms = {}
    for i in range(1, 3):
        trans_file = os.path.join(hicrun_dir, os.pardir, "translation_hap" + str(i))
        for line in open(trans_file):
            [id, chr_num, plen, chr_len] = line.split()
            if chr_num in acros and int(plen) < int(chr_len):
                long_arm = chr_num + "_" + str(i)
                #use the longest assignment that is still shorter than the known chr length
                if long_arm in long_arms and long_arms[long_arm][1] > int(plen):
                    continue
                long_arms[long_arm] = [id, int(plen)]
    chr_to_rukkiid = {}
    rukkiid_to_name = {}
    for line in open(translation_paths_file):
        arr = line.strip().split()
        if len(arr) == 3 and arr[0] == "path":
            chr_to_rukkiid[arr[1]] = arr[2]
            rukkiid_to_name[arr[2]] = arr[1]

    print("long arm rukki ids:")
    mapping_based_long_arm_path_ids = set()
    for long_arm in sorted(long_arms.keys()):
        mapping_based_long_arm_path_ids.add(chr_to_rukkiid[long_arms[long_arm][0]])
    print("experimental but not mapping based:")
    print(experimental_long_arm_path_ids.keys() - mapping_based_long_arm_path_ids)

    print("Mapping based but not experimental based:")
    print(mapping_based_long_arm_path_ids - experimental_long_arm_path_ids.keys())


long_arm_paths = {}
for long_arm in sorted(experimental_long_arm_path_ids.keys()):
    long_arm_paths[long_arm] = paths.getPathById(long_arm)
print(f"Total long arms found {len(experimental_long_arm_path_ids.keys())}")


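#node -> number of occurrences over all paths; the scoring below uses it to handle repeated (homozygous-like) nodes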
multiplicities = sf.get_multiplicities(paths)



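#recover connected components of the Hi-C graph from 'Connected ...' lines of the hicverkko log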
components = {}
component_id = 0
node_to_component = {}
for line in open(hicverkko_log):
    arr = line.strip().split()
    if arr[0] == "Connected":
        component_id += 1
        utigs = re.findall(r"'(utig\d+-\d+)'", line)
        for utig in utigs:
            node_to_component[utig] = component_id
        components[component_id] = utigs

longarm_to_component = {}
longarm_component_nodes = {}
component_to_longarm = {}
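#assign each long arm to the component that covers the largest total node length of its path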
for long_arm in sorted(experimental_long_arm_path_ids.keys()):
    path = long_arm_paths[long_arm]
    print(f"long arm {long_arm} path {path}")
    comp_w = {}
    for component_id in components.keys():
        comp_w[component_id] = 0
    for node in path:
        if node in node_to_component:
            comp_w[node_to_component[node]] += G.nodes[node+'+']['length']
    majority_component = max(comp_w, key=comp_w.get)
    longarm_to_component[long_arm] = majority_component
    if not majority_component in component_to_longarm:
        component_to_longarm[majority_component] = []
        longarm_component_nodes[majority_component] = []
    component_to_longarm[majority_component].append(long_arm)
    longarm_component_nodes[majority_component].extend(long_arm_paths[long_arm])

print("long arm components")
print(longarm_to_component)


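#symmetric Hi-C link weights keyed by oriented node names; all four orientation combinations get the same count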
weights_map = {}
for line in open(hic_byread):
    arr = line.strip().split()
    if not arr[1]+'+' in weights_map:
        for dir in ["+", "-"]:
            weights_map[arr[1] + dir] = {}
    if not arr[2]+'+' in weights_map:
        for dir in ["+", "-"]:
            weights_map[arr[2] + dir] = {}
    for dir1 in ["+", "-"]:
        for dir2 in ["+", "-"]:
            weights_map[arr[1]+dir1][arr[2] + dir2] = int(arr[3])
            weights_map[arr[2]+dir1][arr[1] + dir2] = int(arr[3])

#path scoring
final_assignment = {}

tsv_out = {}
#if one long arm is assigned to multiple short arms - do not scaffold any of them and report the oddity
long_arm_multiplicities = {}
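#pick the best long arm for every short arm and count how often each long arm is chosen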
for short_id in short_arm_paths_ids.keys():
    short_arm_nodes = paths.getPathById(short_id)
    # print (weights_map[short_id])
    best_path = sf.get_best_path(short_id, long_arm_paths, paths, longarm_to_component, multiplicities, weights_map, G)
    final_assignment[short_id] = best_path
    if not best_path in long_arm_multiplicities:
        long_arm_multiplicities[best_path] = 0
    long_arm_multiplicities[best_path] += 1
for short_id in final_assignment.keys():
    best_path = final_assignment[short_id]
    if best_path != "Unclear":
        if long_arm_multiplicities[best_path] > 1:
            error_str = f"Multiple short arms assigned to one long arm {best_path}, not scaffolding!"
            for id in final_assignment.keys():
                if final_assignment[id] == best_path:
                    error_str += f" {id}"
                    final_assignment[id] = "Unclear"
            print(error_str)
        else:
            to_output = []
            if experimental_long_arm_path_ids[best_path]:
                to_output.extend(paths.getPathById(best_path))
            else:
                to_output.extend(sf.rc_path(paths.getPathById(best_path)))
            to_output.append("[N1000001N:rdna]")
            if short_arm_paths_ids[short_id]:
                to_output.extend(sf.rc_path(paths.getPathById(short_id)))
            else:
                to_output.extend(paths.getPathById(short_id))
            res_path = ','.join(to_output)
            tsv_out[best_path] = res_path
            tsv_out[short_id] = "NONE"

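#rewrite the rukki TSV: scaffolded long arms get the merged path, consumed short arms are dropped, everything else is copied through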
with open(scaff_rukki_tsv_file, "w") as out_tsv:
    for line in open(rukki_tsv_file):
        arr = line.strip().split()
        if arr[0] in tsv_out:
            if tsv_out[arr[0]] == "NONE":
                continue
            else:
                out_tsv.write(arr[0] + "\t" + tsv_out[arr[0]] + "\t" + arr[2] + "\n")
        else:
            out_tsv.write(line)
with open(scaff_rukki_gaf_file, "w") as out_gaf:
    for line in open(scaff_rukki_tsv_file):
        arr = line.strip().split()
        out_gaf.write(arr[0] + "\t" + gf.tsv2gaf(arr[1]) + "\t" + arr[2] + "\n")

#Additional output for debugging only
scores = {}
assgnd = 0
for short_id in short_arm_paths_ids.keys():
    print(f"Assigned PATH: {short_id} --- {final_assignment[short_id]}")
    if final_assignment[short_id] != "Unclear":
        assgnd += 1
    scores[short_id] = {}
    for long_arm in long_arm_paths.keys():
        scores[short_id][long_arm] = sf.scorepath(paths.getPathById(short_id), long_arm_paths[long_arm], multiplicities, weights_map, True)
    print(f"All scores using homozygous nodes: {short_id} --- {scores[short_id]}\n")

    scores[short_id] = {}
    for long_arm in long_arm_paths.keys():
        scores[short_id][long_arm] = sf.scorepath(paths.getPathById(short_id), long_arm_paths[long_arm], multiplicities, weights_map, False)
    print(f"All scores NOT using homozygous nodes: {short_id} --- {scores[short_id]}\n")

print(f"total paths assigned {assgnd} of {len(short_arm_paths_ids)}")
18 changes: 11 additions & 7 deletions src/scripts/scaffolding/scaffold_graph.py
@@ -166,17 +166,20 @@ def nodeToPathDist(self, node, path, check_homologous):
 closest = 1000000000
 add_dist = 0
 for path_node in path:
+    #self.logger.debug (f"Checking nodepair dist from {node} to {path_node}")
     if path_node in self.upd_G.nodes:
         if path_node in self.dists[node]:
             if path_node == node:
                 closest = 0
-            closest = min(closest, self.dists[node][path_node] + add_dist - self.compressed_lens[node.strip("-+")] - self.compressed_lens[path_node.strip("-+")])
-            if check_homologous:
-                for hom_node in self.homologousOrNodes(path_node):
-                    if hom_node == node:
-                        closest = 0
-                    if hom_node in self.dists[node]:
-                        closest = min(closest, self.dists[node][hom_node] + add_dist - self.compressed_lens[node.strip("-+")] - self.compressed_lens[hom_node.strip("-+")])
+            else:
+                closest = min(closest, self.dists[node][path_node] + add_dist - self.compressed_lens[node.strip("-+")] - self.compressed_lens[path_node.strip("-+")])
+        if check_homologous:
+            for hom_node in self.homologousOrNodes(path_node):
+                #self.logger.debug (f"Checking homologous nodepair dist from {node} to {hom_node}")
+                if hom_node == node:
+                    closest = 0
+                elif hom_node in self.dists[node]:
+                    closest = min(closest, self.dists[node][hom_node] + add_dist - self.compressed_lens[node.strip("-+")] - self.compressed_lens[hom_node.strip("-+")])
     add_dist += 2* self.compressed_lens[path_node.strip("-+")]
 return closest/2
 #Optionally allowing to use homologous nodes (to improve in gaps)
@@ -189,6 +192,7 @@ def pathDist(self, path_from, path_to, check_homologous):
     closest = min(closest, self.nodeToPathDist(node, path_to, check_homologous) + add_dist)
     if check_homologous:
         for hom_node in self.homologousOrNodes(node):
+            #self.logger.debug (f"Checking homologous dist from {hom_node} to {path_to} add_dist {add_dist}")
             closest = min(closest, (self.nodeToPathDist(hom_node, path_to, check_homologous) + add_dist))
     add_dist += 2* self.compressed_lens[node.strip("-+")]
 return closest
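The distance bookkeeping in nodeToPathDist is easier to see standalone. A minimal sketch, assuming dists[a][b] holds all-pairs Dijkstra distances between node midpoints over doubled mid_length weights and lens holds compressed node lengths (the function and argument names are stand-ins, not the class API):

#hedged sketch of the midpoint-distance arithmetic from nodeToPathDist
def node_to_path_dist(node, path, dists, lens):
    closest = 1000000000
    add_dist = 0  #doubled length of the path prefix walked so far
    for path_node in path:
        if path_node == node:
            closest = 0
        elif path_node in dists.get(node, {}):
            #dists are doubled midpoint-to-midpoint values, so subtracting each
            #node's full length converts them to end-to-end distances
            closest = min(closest, dists[node][path_node] + add_dist
                          - lens[node.strip("-+")] - lens[path_node.strip("-+")])
        add_dist += 2 * lens[path_node.strip("-+")]
    return closest / 2  #undo the doubling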
