Skip to content

Commit

Permalink
fixes to layout from master and cherry-pick
Browse files Browse the repository at this point in the history
  • Loading branch information
skoren committed Nov 20, 2023
1 parent 8d984cf commit a0bb3ab
Showing 1 changed file with 23 additions and 1 deletion.
24 changes: 23 additions & 1 deletion src/scripts/get_layout_from_mbg.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,13 @@

mapping_file = sys.argv[1]
edge_overlap_file = sys.argv[2]

#Read alignments to the graph. Only hifi alignments to mbg and gaps are currently used
read_alignment_file = sys.argv[3]

#Either paths from rukki or just single nodes
paths_file = sys.argv[4]

nodelens_file = sys.argv[5]
layout_output = sys.argv[6]
layscf_output = sys.argv[7]
Expand All @@ -23,6 +28,7 @@ def canon(left, right):
return (revnode(right), revnode(left))
return (left, right)

#transform paths to base elements - mbg nodes and gaps.
def get_leafs(path, mapping, edge_overlaps, raw_node_lens):
path_len = 0
for i in range(0, len(path)):
Expand Down Expand Up @@ -81,6 +87,9 @@ def get_leafs(path, mapping, edge_overlaps, raw_node_lens):
assert current_len == path_len
return (result, overlaps)

#this gives us matches of individual read to contigs(= rukki paths or isolated utig4 node)
#path is an alignment of read (for most cases, hifi read to utig1 graph)
#node_poses - map from nodes to the list of contigs and their positions in contigs
def get_matches(path, node_poses, contig_nodeseqs, raw_node_lens, edge_overlaps, pathleftclip, pathrightclip, readleftclip, readrightclip, readlen, readstart, readend, gap):
longest = None
result = []
Expand Down Expand Up @@ -226,6 +235,9 @@ def get_exact_match_length(clusters):
assert left_len == right_len - left_clip - right_clip

pieceid = 0

#all these contains info about contigs - here nodes or rukki paths splitted by N
#paths are transformed into mbg nodes and gaps with get_leafs procedure
contig_lens = {}
contig_nodeseqs = {}
contig_nodeoverlaps = {}
Expand All @@ -246,6 +258,7 @@ def get_exact_match_length(clusters):
contig_pieces[fullname] = []

for pp in pathfull:
#pp is either path without gaps or gap. In latest case do nothing
gp = re.match(r"\[(N\d+N)(?:[^\]]+){0,1}\]", pp)
if gp:
contig_pieces[fullname].append("[" + gp.group(1) + "]")
Expand Down Expand Up @@ -282,6 +295,7 @@ def get_exact_match_length(clusters):

contig_pieces[fullname].append("end")

#map from node to list of contigs where it occures
node_poses = {}
for contigname in contig_nodeseqs:
for i in range(0, len(contig_nodeseqs[contigname])):
Expand All @@ -297,6 +311,7 @@ def get_exact_match_length(clusters):
next_read_id = 0

matches_per_read = {}
readname_to_paths = {}
with open(read_alignment_file) as f:
for l in f:
parts = l.strip().split('\t')
Expand All @@ -319,6 +334,8 @@ def get_exact_match_length(clusters):
if node[1:4] == "gap":
gap = True
break
readname_to_paths[readname] = [path, gap]

matches = get_matches(path, node_poses, contig_nodeseqs, raw_node_lens, edge_overlaps, pathleftclip, pathrightclip, readleftclip, readrightclip, readlen, readstart, readend, gap)
if len(matches) == 0: continue
if readname not in matches_per_read: matches_per_read[readname] = []
Expand Down Expand Up @@ -360,7 +377,11 @@ def get_exact_match_length(clusters):
len_readend = readlen
contig_contains_reads[contig][readname].append((contigpos + readlen, contigpos, len_readstart, len_readend, readlen, readstart, readend))

#here we clusterize separate matches to the nodes in the path to clusters (by match position in contig)

read_clusters = {}
total_banned = 0
total_notbanned = 0
for contig in contig_contains_reads:
print (f"debugggging {contig}")
cont_seq = ""
Expand Down Expand Up @@ -424,7 +445,6 @@ def get_exact_match_length(clusters):
else:
total_notbanned +=1

>>>>>>> 50e523e (working version)
lines.sort(key=lambda x: min(x[0], x[1]))
fwcluster = None
bwcluster = None
Expand All @@ -440,6 +460,7 @@ def get_exact_match_length(clusters):
if fw:
if fwcluster is None:
fwcluster = (contigstart, contigend, readstart, readend, [(real_readstart, real_readend)])
#this is the place where alignment to different nodes are actually united
elif contigstart < fwcluster[0] + 50 and contigend < fwcluster[1] + 50:
fwcluster = (contigstart, contigend, min(fwcluster[2], readstart), max(fwcluster[3], readend), fwcluster[4] + [(real_readstart, real_readend)])
else:
Expand All @@ -463,6 +484,7 @@ def get_exact_match_length(clusters):
read_clusters[readname].append((contig, bwcluster[1], bwcluster[0], get_exact_match_length(bwcluster[4])))
print (f"Banned {total_banned} allowed {total_notbanned}")
contig_actual_lines = {}
#selecting best alignment to the contig, if multiple best use all of them.
for readname in read_clusters:
longest = []
for line in read_clusters[readname]:
Expand Down

0 comments on commit a0bb3ab

Please sign in to comment.