
Commit f6da468
presumably completely runnable pipeline
Dmitry-Antipov committed May 31, 2024
1 parent ed4893b commit f6da468
Showing 2 changed files with 12 additions and 225 deletions.
219 changes: 1 addition & 218 deletions src/scripts/rdna_scaff.py
@@ -59,223 +59,6 @@
rukki_paths = sf.read_rukki_paths(old_rukki_tsv_file, G)
gf.load_indirect_graph(gfa_file, indirectG)
#sf.try_to_scaff(rukki_paths, telomere_locations_file, os.path.join(hicrun_dir, "hic_mapping.byread.output"), os.path.join(hicrun_dir, "unitigs.matches"), G, indirectG, uncompressed_nodes)
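#the scaffold graph is built from rukki paths, telomere locations, Hi-C read pairs and a non-HPC mashmap self-alignment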
-sg = scaffold_graph.ScaffoldGraph(rukki_paths, telomere_locations_file, os.path.join(hicrun_dir, "hic_mapping.byread_upd.output"), os.path.join(hicrun_dir, "mashmap_nonhpc50.out"), G, uncompressed_nodes, logger)
+sg = scaffold_graph.ScaffoldGraph(rukki_paths, telomere_locations_file, os.path.join(hicrun_dir, "hic_mapping.byread.output"), os.path.join(hicrun_dir, "mashmap_nonhpc50.out"), G, uncompressed_nodes, logger)
res = sg.generateScaffolds()

exit()

#read graph
#add_telomeric_nodes
#construct new graph (vertex = start/end of edge), ignoring overlaps
#dijkstra
#condition

#not expected to be run in main pipeline
if debug and os.path.exists(rdna_file):
    print("Somehow you are deep in verkko's debug, is it intentional?")
    rdna_nodes_ribotin = sf.get_rdna_ribotin(hicrun_dir)
    rdna_nodes_mashmap = sf.get_rdna_mashmap(hicrun_dir)
    print("In ribotin but not in mashmap")
    print(rdna_nodes_ribotin - rdna_nodes_mashmap)
    print("In mashmap but not in ribotin")
    print(rdna_nodes_mashmap - rdna_nodes_ribotin)

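#in normal runs rDNA-containing nodes are detected with mashmap only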
rdna_nodes = sf.get_rdna_mashmap(hicrun_dir)


G = nx.DiGraph()
#graph_f fails?
#TODO should use graph_functions!
gf.load_direct_graph(gfa_file, G)
#mid_length edge weights give doubled distances between node midpoints
dists = dict(nx.all_pairs_dijkstra_path_length(G, weight='mid_length'))
paths = sf.read_rukki_paths(rukki_tsv_file, G)

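#classify rukki paths into short arms (100 kb - 5 Mb) and candidate long arms (> 10 Mb) next to the rDNA; the final boolean presumably flags the arm type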
short_arm_paths_ids = sf.get_arm_paths_ids(telomere_locations_file, G, paths, rdna_nodes, 100000, 5000000, False)
experimental_long_arm_path_ids = sf.get_arm_paths_ids(telomere_locations_file, G, paths, rdna_nodes, 10000000, 999999999, True)
same_component_long_arm_path_ids = sf.get_same_component_paths(short_arm_paths_ids, G, paths, 10000000, 999999999)
for idd in same_component_long_arm_path_ids.keys():
    if idd not in experimental_long_arm_path_ids:
        experimental_long_arm_path_ids[idd] = same_component_long_arm_path_ids[idd]
        print(f"Component-based added path {idd}")
print(f"short arms PATH number {len(short_arm_paths_ids)}")
for idd in short_arm_paths_ids.keys():
    print(paths.getPathById(idd))

print(short_arm_paths_ids)
#parse translation_hap files

#getting long arms
#e.g.: haplotype2-0000054 chr22 45121307 51324926
#chr names are not available in the general case

#Should not be run in normal runs
if debug and os.path.exists(translation_hap1_file) and os.path.exists(translation_paths_file):
    print("Somehow you are deep in verkko's debug, is it intentional?")
    long_arm_paths = {}
    multiplicities = {}
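    #the five human acrocentric chromosomes; their short arms carry the rDNA arrays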
    acros = {"chr13", "chr14", "chr15", "chr21", "chr22"}
    long_arms = {}
    for i in range(1, 3):
        trans_file = os.path.join(hicrun_dir, os.pardir, "translation_hap" + str(i))
        for line in open(trans_file):
            [id, chr_num, plen, chr_len] = line.split()
            if chr_num in acros and int(plen) < int(chr_len):
                long_arm = chr_num + "_" + str(i)
                #use the longest assignment that is still shorter than the known chr length
                if long_arm in long_arms and long_arms[long_arm][1] > int(plen):
                    continue
                long_arms[long_arm] = [id, int(plen)]
    chr_to_rukkiid = {}
    rukkiid_to_name = {}
    for line in open(translation_paths_file):
        arr = line.strip().split()
        if len(arr) == 3 and arr[0] == "path":
            chr_to_rukkiid[arr[1]] = arr[2]
            rukkiid_to_name[arr[2]] = arr[1]

    print("long arm rukki ids:")
    mapping_based_long_arm_path_ids = set()
    for long_arm in sorted(long_arms.keys()):
        mapping_based_long_arm_path_ids.add(chr_to_rukkiid[long_arms[long_arm][0]])
    print("experimental but not mapping based:")
    print(experimental_long_arm_path_ids.keys() - mapping_based_long_arm_path_ids)

    print("Mapping based but not experimental based:")
    print(mapping_based_long_arm_path_ids - experimental_long_arm_path_ids.keys())


long_arm_paths = {}
for long_arm in sorted(experimental_long_arm_path_ids.keys()):
    long_arm_paths[long_arm] = paths.getPathById(long_arm)
print(f"Total long arms found {len(experimental_long_arm_path_ids.keys())}")


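#node -> number of occurrences over all paths; the scoring below uses it to handle repeated (homozygous-like) nodes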
multiplicities = sf.get_multiplicities(paths)



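#recover connected components of the Hi-C graph from 'Connected ...' lines of the hicverkko log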
components = {}
component_id = 0
node_to_component = {}
for line in open(hicverkko_log):
    arr = line.strip().split()
    if arr[0] == "Connected":
        component_id += 1
        utigs = re.findall(r"'(utig\d+-\d+)'", line)
        for utig in utigs:
            node_to_component[utig] = component_id
        components[component_id] = utigs

longarm_to_component = {}
longarm_component_nodes = {}
component_to_longarm = {}
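#assign each long arm to the component that covers the largest total node length of its path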
for long_arm in sorted(experimental_long_arm_path_ids.keys()):
    path = long_arm_paths[long_arm]
    print(f"long arm {long_arm} path {path}")
    comp_w = {}
    for component_id in components.keys():
        comp_w[component_id] = 0
    for node in path:
        if node in node_to_component:
            comp_w[node_to_component[node]] += G.nodes[node+'+']['length']
    majority_component = max(comp_w, key=comp_w.get)
    longarm_to_component[long_arm] = majority_component
    if not majority_component in component_to_longarm:
        component_to_longarm[majority_component] = []
        longarm_component_nodes[majority_component] = []
    component_to_longarm[majority_component].append(long_arm)
    longarm_component_nodes[majority_component].extend(long_arm_paths[long_arm])

print("long arm components")
print(longarm_to_component)


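#symmetric Hi-C link weights keyed by oriented node names; all four orientation combinations get the same count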
weights_map = {}
for line in open(hic_byread):
    arr = line.strip().split()
    if not arr[1]+'+' in weights_map:
        for dir in ["+", "-"]:
            weights_map[arr[1] + dir] = {}
    if not arr[2]+'+' in weights_map:
        for dir in ["+", "-"]:
            weights_map[arr[2] + dir] = {}
    for dir1 in ["+", "-"]:
        for dir2 in ["+", "-"]:
            weights_map[arr[1]+dir1][arr[2] + dir2] = int(arr[3])
            weights_map[arr[2]+dir1][arr[1] + dir2] = int(arr[3])

#path scoring
final_assignment = {}

tsv_out = {}
#if one long arm is assigned to multiple short arms - do not scaffold any of them and report the oddity
long_arm_multiplicities = {}
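#pick the best long arm for every short arm and count how often each long arm is chosen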
for short_id in short_arm_paths_ids.keys():
    short_arm_nodes = paths.getPathById(short_id)
    # print (weights_map[short_id])
    best_path = sf.get_best_path(short_id, long_arm_paths, paths, longarm_to_component, multiplicities, weights_map, G)
    final_assignment[short_id] = best_path
    if not best_path in long_arm_multiplicities:
        long_arm_multiplicities[best_path] = 0
    long_arm_multiplicities[best_path] += 1
for short_id in final_assignment.keys():
    best_path = final_assignment[short_id]
    if best_path != "Unclear":
        if long_arm_multiplicities[best_path] > 1:
            error_str = f"Multiple short arms assigned to one long arm {best_path}, not scaffolding!"
            for id in final_assignment.keys():
                if final_assignment[id] == best_path:
                    error_str += f" {id}"
                    final_assignment[id] = "Unclear"
            print(error_str)
        else:
            to_output = []
            if experimental_long_arm_path_ids[best_path]:
                to_output.extend(paths.getPathById(best_path))
            else:
                to_output.extend(sf.rc_path(paths.getPathById(best_path)))
            to_output.append("[N1000001N:rdna]")
            if short_arm_paths_ids[short_id]:
                to_output.extend(sf.rc_path(paths.getPathById(short_id)))
            else:
                to_output.extend(paths.getPathById(short_id))
            res_path = ','.join(to_output)
            tsv_out[best_path] = res_path
            tsv_out[short_id] = "NONE"

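#rewrite the rukki TSV: scaffolded long arms get the merged path, consumed short arms are dropped, everything else is copied through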
with open(scaff_rukki_tsv_file, "w") as out_tsv:
    for line in open(rukki_tsv_file):
        arr = line.strip().split()
        if arr[0] in tsv_out:
            if tsv_out[arr[0]] == "NONE":
                continue
            else:
                out_tsv.write(arr[0] + "\t" + tsv_out[arr[0]] + "\t" + arr[2] + "\n")
        else:
            out_tsv.write(line)
with open(scaff_rukki_gaf_file, "w") as out_gaf:
    for line in open(scaff_rukki_tsv_file):
        arr = line.strip().split()
        out_gaf.write(arr[0] + "\t" + gf.tsv2gaf(arr[1]) + "\t" + arr[2] + "\n")

#Additional output for debugging only
scores = {}
assgnd = 0
for short_id in short_arm_paths_ids.keys():
    print(f"Assigned PATH: {short_id} --- {final_assignment[short_id]}")
    if final_assignment[short_id] != "Unclear":
        assgnd += 1
    scores[short_id] = {}
    for long_arm in long_arm_paths.keys():
        scores[short_id][long_arm] = sf.scorepath(paths.getPathById(short_id), long_arm_paths[long_arm], multiplicities, weights_map, True)
    print(f"All scores using homozygous nodes: {short_id} --- {scores[short_id]}\n")

    scores[short_id] = {}
    for long_arm in long_arm_paths.keys():
        scores[short_id][long_arm] = sf.scorepath(paths.getPathById(short_id), long_arm_paths[long_arm], multiplicities, weights_map, False)
    print(f"All scores NOT using homozygous nodes: {short_id} --- {scores[short_id]}\n")

print(f"total paths assigned {assgnd} of {len(short_arm_paths_ids)}")
18 changes: 11 additions & 7 deletions src/scripts/scaffolding/scaffold_graph.py
@@ -166,17 +166,20 @@ def nodeToPathDist(self, node, path, check_homologous):
 closest = 1000000000
 add_dist = 0
 for path_node in path:
+    #self.logger.debug (f"Checking nodepair dist from {node} to {path_node}")
     if path_node in self.upd_G.nodes:
         if path_node in self.dists[node]:
             if path_node == node:
                 closest = 0
-            closest = min(closest, self.dists[node][path_node] + add_dist - self.compressed_lens[node.strip("-+")] - self.compressed_lens[path_node.strip("-+")])
-            if check_homologous:
-                for hom_node in self.homologousOrNodes(path_node):
-                    if hom_node == node:
-                        closest = 0
-                    if hom_node in self.dists[node]:
-                        closest = min(closest, self.dists[node][hom_node] + add_dist - self.compressed_lens[node.strip("-+")] - self.compressed_lens[hom_node.strip("-+")])
+            else:
+                closest = min(closest, self.dists[node][path_node] + add_dist - self.compressed_lens[node.strip("-+")] - self.compressed_lens[path_node.strip("-+")])
+        if check_homologous:
+            for hom_node in self.homologousOrNodes(path_node):
+                #self.logger.debug (f"Checking homologous nodepair dist from {node} to {hom_node}")
+                if hom_node == node:
+                    closest = 0
+                elif hom_node in self.dists[node]:
+                    closest = min(closest, self.dists[node][hom_node] + add_dist - self.compressed_lens[node.strip("-+")] - self.compressed_lens[hom_node.strip("-+")])
     add_dist += 2* self.compressed_lens[path_node.strip("-+")]
 return closest/2
 #Optionally allowing to use homologous nodes (to improve in gaps)
@@ -189,6 +192,7 @@ def pathDist(self, path_from, path_to, check_homologous):
     closest = min(closest, self.nodeToPathDist(node, path_to, check_homologous) + add_dist)
     if check_homologous:
         for hom_node in self.homologousOrNodes(node):
+            #self.logger.debug (f"Checking homologous dist from {hom_node} to {path_to} add_dist {add_dist}")
             closest = min(closest, (self.nodeToPathDist(hom_node, path_to, check_homologous) + add_dist))
     add_dist += 2* self.compressed_lens[node.strip("-+")]
 return closest
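The distance bookkeeping in nodeToPathDist is easier to see standalone. A minimal sketch, assuming dists[a][b] holds all-pairs Dijkstra distances between node midpoints over doubled mid_length weights and lens holds compressed node lengths (the function and argument names are stand-ins, not the class API):

#hedged sketch of the midpoint-distance arithmetic from nodeToPathDist
def node_to_path_dist(node, path, dists, lens):
    closest = 1000000000
    add_dist = 0  #doubled length of the path prefix walked so far
    for path_node in path:
        if path_node == node:
            closest = 0
        elif path_node in dists.get(node, {}):
            #dists are doubled midpoint-to-midpoint values, so subtracting each
            #node's full length converts them to end-to-end distances
            closest = min(closest, dists[node][path_node] + add_dist
                          - lens[node.strip("-+")] - lens[path_node.strip("-+")])
        add_dist += 2 * lens[path_node.strip("-+")]
    return closest / 2  #undo the doubling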
