fixes to layout from master and cherry-pick

marbl · Nov 20, 2023 · a0bb3ab · a0bb3ab
1 parent 8d984cf
commit a0bb3ab
Showing 1 changed file with 23 additions and 1 deletion.
diff --git a/src/scripts/get_layout_from_mbg.py b/src/scripts/get_layout_from_mbg.py
@@ -5,8 +5,13 @@
 
 mapping_file = sys.argv[1]
 edge_overlap_file = sys.argv[2]
+
+#Read alignments to the graph. Only hifi alignments to mbg and gaps are currently used
 read_alignment_file = sys.argv[3]
+
+#Either paths from rukki or just single nodes
 paths_file = sys.argv[4]
+
 nodelens_file = sys.argv[5]
 layout_output = sys.argv[6]
 layscf_output = sys.argv[7]
@@ -23,6 +28,7 @@ def canon(left, right):
 		return (revnode(right), revnode(left))
 	return (left, right)
 
+#transform paths to base elements - mbg nodes and gaps.
 def get_leafs(path, mapping, edge_overlaps, raw_node_lens):
 	path_len = 0
 	for i in range(0, len(path)):
@@ -81,6 +87,9 @@ def get_leafs(path, mapping, edge_overlaps, raw_node_lens):
 		assert current_len == path_len
 	return (result, overlaps)
 
+#this gives us matches of individual read to contigs(= rukki paths or isolated utig4 node)
+#path is an alignment of read (for most cases, hifi read to utig1 graph)
+#node_poses - map from nodes to the list of contigs and their positions in contigs 
 def get_matches(path, node_poses, contig_nodeseqs, raw_node_lens, edge_overlaps, pathleftclip, pathrightclip, readleftclip, readrightclip, readlen, readstart, readend, gap):
 	longest = None
 	result = []
@@ -226,6 +235,9 @@ def get_exact_match_length(clusters):
 		assert left_len == right_len - left_clip - right_clip
 
 pieceid = 0
+
+#all these contains info about contigs - here nodes or rukki paths splitted by N
+#paths are transformed into mbg nodes and gaps with get_leafs procedure
 contig_lens = {}
 contig_nodeseqs = {}
 contig_nodeoverlaps = {}
@@ -246,6 +258,7 @@ def get_exact_match_length(clusters):
 		contig_pieces[fullname] = []
 
 		for pp in pathfull:
+			#pp is either path without gaps or gap. In latest case do nothing
 			gp = re.match(r"\[(N\d+N)(?:[^\]]+){0,1}\]", pp)
 			if gp:
 				contig_pieces[fullname].append("[" + gp.group(1) + "]")
@@ -282,6 +295,7 @@ def get_exact_match_length(clusters):
 
 		contig_pieces[fullname].append("end")
 
+#map from node to list of contigs where it occures
 node_poses = {}
 for contigname in contig_nodeseqs:
 	for i in range(0, len(contig_nodeseqs[contigname])):
@@ -297,6 +311,7 @@ def get_exact_match_length(clusters):
 next_read_id = 0
 
 matches_per_read = {}
+readname_to_paths = {}
 with open(read_alignment_file) as f:
 	for l in f:
 		parts = l.strip().split('\t')
@@ -319,6 +334,8 @@ def get_exact_match_length(clusters):
 			if node[1:4] == "gap":
 				gap = True
 				break
+		readname_to_paths[readname] = [path, gap]
+
 		matches = get_matches(path, node_poses, contig_nodeseqs, raw_node_lens, edge_overlaps, pathleftclip, pathrightclip, readleftclip, readrightclip, readlen, readstart, readend, gap)
 		if len(matches) == 0: continue
 		if readname not in matches_per_read: matches_per_read[readname] = []
@@ -360,7 +377,11 @@ def get_exact_match_length(clusters):
 				len_readend = readlen
 			contig_contains_reads[contig][readname].append((contigpos + readlen, contigpos, len_readstart, len_readend, readlen, readstart, readend))
 
+#here we clusterize separate matches to the nodes in the path to clusters (by match position in contig)
+
 read_clusters = {}
+total_banned = 0
+total_notbanned = 0
 for contig in contig_contains_reads:
 	print (f"debugggging  {contig}")
 	cont_seq = ""
@@ -424,7 +445,6 @@ def get_exact_match_length(clusters):
 			else:
 				total_notbanned +=1
 
->>>>>>> 50e523e (working version)
 		lines.sort(key=lambda x: min(x[0], x[1]))
 		fwcluster = None
 		bwcluster = None
@@ -440,6 +460,7 @@ def get_exact_match_length(clusters):
 			if fw:
 				if fwcluster is None:
 					fwcluster = (contigstart, contigend, readstart, readend, [(real_readstart, real_readend)])
+				#this is the place where alignment to different nodes are actually united
 				elif contigstart < fwcluster[0] + 50 and contigend < fwcluster[1] + 50:
 					fwcluster = (contigstart, contigend, min(fwcluster[2], readstart), max(fwcluster[3], readend), fwcluster[4] + [(real_readstart, real_readend)])
 				else:
@@ -463,6 +484,7 @@ def get_exact_match_length(clusters):
 				read_clusters[readname].append((contig, bwcluster[1], bwcluster[0], get_exact_match_length(bwcluster[4])))
 print (f"Banned {total_banned} allowed {total_notbanned}")
 contig_actual_lines = {}
+#selecting best alignment to the contig, if multiple best use all of them.
 for readname in read_clusters:
 	longest = []
 	for line in read_clusters[readname]: