diff --git a/src/scripts/rdna_scaff.py b/src/scripts/rdna_scaff.py index fc2bdc9a..0cd45450 100755 --- a/src/scripts/rdna_scaff.py +++ b/src/scripts/rdna_scaff.py @@ -59,223 +59,6 @@ rukki_paths = sf.read_rukki_paths(old_rukki_tsv_file, G) gf.load_indirect_graph(gfa_file, indirectG) #sf.try_to_scaff(rukki_paths, telomere_locations_file, os.path.join(hicrun_dir, "hic_mapping.byread.output"), os.path.join(hicrun_dir, "unitigs.matches"), G, indirectG, uncompressed_nodes) -sg = scaffold_graph.ScaffoldGraph(rukki_paths, telomere_locations_file, os.path.join(hicrun_dir, "hic_mapping.byread_upd.output"), os.path.join(hicrun_dir, "mashmap_nonhpc50.out"), G, uncompressed_nodes, logger) +sg = scaffold_graph.ScaffoldGraph(rukki_paths, telomere_locations_file, os.path.join(hicrun_dir, "hic_mapping.byread.output"), os.path.join(hicrun_dir, "mashmap_nonhpc50.out"), G, uncompressed_nodes, logger) res = sg.generateScaffolds() -exit() - -#read graph -#add_telomeric_nodes -#construct new graph (vertex = start/end of edge), ignoring overlaps -#dijkstra -#condition - -#not expexted to be run in main pipeline -if debug and os.path.exists(rdna_file): - print("Somehow you are deeply in verkko's debug, is it intentional?") - rdna_nodes_mashmap = sf.get_rdna_ribotin(hicrun_dir) - rdna_nodes2= sf.get_rdna_mashmap(hicrun_dir) - print ("Ribo non mashmap") - print(rdna_nodes_mashmap - rdna_nodes2) - print ("Mashmap non ribo") - print(rdna_nodes2 - rdna_nodes_mashmap) - -rdna_nodes = sf.get_rdna_mashmap(hicrun_dir) - - -G = nx.DiGraph() -#graph_f fails? -#TODO should use graph_functions! -gf.load_direct_graph(gfa_file, G) -#double distance between middle of nodes -dists = dict(nx.all_pairs_dijkstra_path_length(G, weight='mid_length')) -paths = sf.read_rukki_paths(rukki_tsv_file, G) - -short_arm_paths_ids = sf.get_arm_paths_ids(telomere_locations_file, G, paths, rdna_nodes, 100000, 5000000, False) -experimental_long_arm_path_ids = sf.get_arm_paths_ids(telomere_locations_file, G, paths, rdna_nodes, 10000000, 999999999, True) -same_component_long_arm_path_ids = sf.get_same_component_paths(short_arm_paths_ids, G, paths, 10000000,999999999) -for idd in same_component_long_arm_path_ids.keys(): - if idd not in experimental_long_arm_path_ids: - experimental_long_arm_path_ids[idd] = same_component_long_arm_path_ids[idd] - print (f"Component-based added path {idd}") -print (f"short arms PATH number{len(short_arm_paths_ids)}") -for idd in short_arm_paths_ids.keys(): - print(paths.getPathById(idd)) - -print (short_arm_paths_ids) -#parse translation_hap files - -#getting long arms -#haplotype2-0000054 chr22 45121307 51324926 -#chr names in general case not available - -#Should not be run in normal runs -if debug and os.path.exists(translation_hap1_file) and os.path.exists(translation_paths_file): - print("Somehow you are deeply in verkko's debug, is it intentional?") - long_arm_paths = {} - multiplicities = {} - acros = {"chr13", "chr14", "chr15", "chr21", "chr22"} - long_arms = {} - for i in range (1, 3): - trans_file = os.path.join (hicrun_dir, os.pardir, "translation_hap" + str(i)) - for line in open(trans_file): - [id, chr_num, plen, chr_len] = line.split() - if chr_num in acros and int(plen) < int(chr_len): - long_arm = chr_num + "_" + str(i) - #let's use longest assignment that is shorter than fixed known chr length - if long_arm in long_arms and long_arms[long_arm][1]> int(plen): - continue - long_arms[long_arm] =[id, int(plen)] - chr_to_rukkiid = {} - rukkiid_to_name = {} - for line in open(translation_paths_file): - arr = line.strip().split() - if len(arr) == 3 and arr[0] == "path": - chr_to_rukkiid[arr[1]] = arr[2] - rukkiid_to_name[arr[2]] = arr[1] - - print ("long arm rukki ids:") - mapping_based_long_arm_path_ids = set() - for long_arm in sorted(long_arms.keys()): - mapping_based_long_arm_path_ids.add(chr_to_rukkiid[long_arms[long_arm][0]]) - print ("experimental but not mapping based:") - print (experimental_long_arm_path_ids.keys() - mapping_based_long_arm_path_ids) - - print ("Mapping based but not experimental based:") - print (mapping_based_long_arm_path_ids - experimental_long_arm_path_ids.keys()) - - -long_arm_paths = {} -for long_arm in sorted(experimental_long_arm_path_ids.keys()): - long_arm_paths[long_arm] = paths.getPathById(long_arm) -print (f"Total long arms found {len(experimental_long_arm_path_ids.keys())}") - - -multiplicities = sf.get_multiplicities(paths) - - - -components = {} -component_id = 0 -node_to_component = {} -for line in open(hicverkko_log): - arr = line.strip().split() - if arr[0] == "Connected": - component_id += 1 - utigs = re.findall(r"'(utig\d+-\d+)'", line) - for utig in utigs: - node_to_component[utig] = component_id - components[component_id] = utigs - -longarm_to_component = {} -longarm_component_nodes = {} -component_to_longarm = {} -for long_arm in sorted(experimental_long_arm_path_ids.keys()): - path = long_arm_paths[long_arm] - print (f"long arm {long_arm} path {path}") - comp_w = {} - for component_id in components.keys(): - comp_w[component_id] = 0 - for node in path: - if node in node_to_component: - comp_w[node_to_component[node]] += G.nodes[node+'+']['length'] - majority_component = max(comp_w, key=comp_w.get) - longarm_to_component[long_arm] = majority_component - if not majority_component in component_to_longarm: - component_to_longarm[majority_component] = [] - longarm_component_nodes[majority_component] = [] - component_to_longarm[majority_component].append(long_arm) - longarm_component_nodes[majority_component].extend(long_arm_paths[long_arm]) - -print("long arm components") -print(longarm_to_component) - - -weights_map = {} -for line in open (hic_byread): - arr = line.strip().split() - if not arr[1]+'+' in weights_map: - for dir in ["+", "-"]: - weights_map[arr[1] + dir] = {} - if not arr[2]+'+' in weights_map: - for dir in ["+", "-"]: - weights_map[arr[2] + dir] = {} - for dir1 in ["+", "-"]: - for dir2 in ["+", "-"]: - weights_map[arr[1]+dir1][arr[2] + dir2] = int(arr[3]) - weights_map[arr[2]+dir1][arr[1] +dir2] = int(arr[3]) - -#path scoring -final_assignment = {} - -tsv_out = {} -#if one long assigned to multiple short - do not change any and report weirdness -long_arm_multiplicities = {} -for short_id in short_arm_paths_ids.keys(): - short_arm_nodes = paths.getPathById(short_id) - # print (weights_map[short_id]) - max_score = 0 - max_long_arm = "none" - best_path = sf.get_best_path(short_id, long_arm_paths, paths, longarm_to_component, multiplicities, weights_map, G) - final_assignment[short_id] = best_path - if not best_path in long_arm_multiplicities: - long_arm_multiplicities[best_path] = 0 - long_arm_multiplicities[best_path] += 1 -for short_id in final_assignment.keys(): - best_path = final_assignment[short_id] - if best_path != "Unclear": - if long_arm_multiplicities[best_path] > 1: - error_str = f"Multiple short arms assigned to one long arm {best_path}, not scaffolding!" - for id in final_assignment.keys(): - if final_assignment[id] == best_path: - error_str += f" {id}" - final_assignment[id] = "Unclear" - print (error_str) - else: - to_output = [] - if experimental_long_arm_path_ids[best_path]: - to_output.extend(paths.getPathById(best_path)) - else: - to_output.extend(sf.rc_path(paths.getPathById(best_path))) - to_output.append("[N1000001N:rdna]") - if short_arm_paths_ids[short_id]: - to_output.extend(sf.rc_path(paths.getPathById(short_id))) - else: - to_output.extend(paths.getPathById(short_id)) - res_path = ','.join(to_output) - tsv_out[best_path] = res_path - tsv_out[short_id] = "NONE" - -with open(scaff_rukki_tsv_file, "w") as out_tsv: - for line in open(rukki_tsv_file): - arr = line.strip().split() - if arr[0] in tsv_out: - if tsv_out[arr[0]] == "NONE": - continue - else: - out_tsv.write(arr[0] + "\t" + tsv_out[arr[0]] + "\t" + arr[2] + "\n") - else: - out_tsv.write(line) -with open(scaff_rukki_gaf_file, "w") as out_gaf: - for line in open(scaff_rukki_tsv_file): - arr = line.strip().split() - out_gaf.write(arr[0] + "\t" + gf.tsv2gaf(arr[1]) + "\t" + arr[2] + "\n") - -#Additional output for debugging only -scores = {} -assgnd = 0 -for short_id in short_arm_paths_ids.keys(): - print (f"Assigned PATH: {short_id} --- {final_assignment[short_id]}") - if final_assignment[short_id] != "Unclear": - assgnd += 1 - scores[short_id] = {} - for long_arm in long_arm_paths.keys(): - scores[short_id][long_arm] = sf.scorepath(paths.getPathById(short_id), long_arm_paths[long_arm], multiplicities, weights_map, True) - print (f"All scores path using homozygous: {short_id} --- {scores[short_id]}\n") - - scores[short_id] = {} - for long_arm in long_arm_paths.keys(): - scores[short_id][long_arm] = sf.scorepath(paths.getPathById(short_id), long_arm_paths[long_arm], multiplicities, weights_map, False) - print (f"All scores NOT using homozygous: {short_id} --- {scores[short_id]}\n") - -print (f"total paths assigned {assgnd} of {len(short_arm_paths_ids)}") diff --git a/src/scripts/scaffolding/scaffold_graph.py b/src/scripts/scaffolding/scaffold_graph.py index ccda6fb1..a7d16c85 100755 --- a/src/scripts/scaffolding/scaffold_graph.py +++ b/src/scripts/scaffolding/scaffold_graph.py @@ -166,17 +166,20 @@ def nodeToPathDist(self, node, path, check_homologous): closest = 1000000000 add_dist = 0 for path_node in path: + #self.logger.debug (f"Checking nodepair dist from {node} to {path_node}") if path_node in self.upd_G.nodes: if path_node in self.dists[node]: if path_node == node: closest = 0 - closest = min(closest, self.dists[node][path_node] + add_dist - self.compressed_lens[node.strip("-+")] - self.compressed_lens[path_node.strip("-+")]) - if check_homologous: - for hom_node in self.homologousOrNodes(path_node): - if hom_node == node: - closest = 0 - if hom_node in self.dists[node]: - closest = min(closest, self.dists[node][hom_node] + add_dist - self.compressed_lens[node.strip("-+")] - self.compressed_lens[hom_node.strip("-+")]) + else: + closest = min(closest, self.dists[node][path_node] + add_dist - self.compressed_lens[node.strip("-+")] - self.compressed_lens[path_node.strip("-+")]) + if check_homologous: + for hom_node in self.homologousOrNodes(path_node): + #self.logger.debug (f"Checking homologous nodepair dist from {node} to {hom_node}") + if hom_node == node: + closest = 0 + elif hom_node in self.dists[node]: + closest = min(closest, self.dists[node][hom_node] + add_dist - self.compressed_lens[node.strip("-+")] - self.compressed_lens[hom_node.strip("-+")]) add_dist += 2* self.compressed_lens[path_node.strip("-+")] return closest/2 #Optionally allowing to use homologous nodes (to improve in gaps) @@ -189,6 +192,7 @@ def pathDist(self, path_from, path_to, check_homologous): closest = min(closest, self.nodeToPathDist(node, path_to, check_homologous) + add_dist) if check_homologous: for hom_node in self.homologousOrNodes(node) : + #self.logger.debug (f"Checking homologous dist from {hom_node} to {path_to} add_dist {add_dist}") closest = min(closest, (self.nodeToPathDist(hom_node, path_to, check_homologous) + add_dist)) add_dist += 2* self.compressed_lens[node.strip("-+")] return closest