diff --git a/src/scripts/get_layout_from_mbg.py b/src/scripts/get_layout_from_mbg.py index 98d39caa..b6345325 100755 --- a/src/scripts/get_layout_from_mbg.py +++ b/src/scripts/get_layout_from_mbg.py @@ -362,11 +362,69 @@ def get_exact_match_length(clusters): read_clusters = {} for contig in contig_contains_reads: + print (f"debugggging {contig}") + cont_seq = "" + for i in range(0, len(contig_nodeseqs[contig])): + cont_seq += (contig_nodeseqs[contig][i][0]) + print (cont_seq) + + sys.stdout.flush() +#lets try to ban the reads that have a suffix or prefix that contradict to the contig + contig_ends = [contig_nodeseqs[contig][0][0], contig_nodeseqs[contig][-1][0]] + + contig_nodes_no_dir = set() + for i in range(0, len(contig_nodeseqs[contig])): + contig_nodes_no_dir.add(contig_nodeseqs[contig][i][0][1:]) + for readname in contig_contains_reads[contig]: if readname not in read_clusters: read_clusters[readname] = [] lines = contig_contains_reads[contig][readname] assert len(lines) > 0 readlen = lines[0][4] + + #lets not ban gap containing reads + if not readname_to_paths[readname][1]: + to_ban = False + near_contig_end = False + #if read_end not in contig, then something likely went wrong.. + #can be replaced by more accurate check to exclude case where read end is somewhere else, but it should not make big sense. + + to_ban_left = False + to_ban_right = False + if not (readname_to_paths[readname][0][0][1:] in contig_nodes_no_dir): + to_ban_left = True + if not (readname_to_paths[readname][0][-1][1:] in contig_nodes_no_dir): + to_ban_right = True + #lets always allow read to extend contig + contains_contig_ends = [False, False] + for node in readname_to_paths[readname][0]: + for i in range(0,2): + #nodes same without respect to the orientation + if node[1:] == contig_ends[i][1:]: + contains_contig_ends[i] = True + #read going different direction with contig, so should switch ban logic + #rc loop may confuse this logic, but it would be weird anyway. + '''if node[0] != contig_ends[i][0]: + tmp = to_ban_left + to_ban_left = to_ban_right + to_ban_right = tmp + print ("switching") + print (readname_to_paths[readname][0]) ''' + + + if to_ban_left and not(contains_contig_ends[0]): + to_ban = True + if to_ban_right and not(contains_contig_ends[1]): + to_ban = True + if to_ban: + print(f" {to_ban_left} {to_ban_right} banning") + print (readname_to_paths[readname][0]) + total_banned +=1 + continue + else: + total_notbanned +=1 + +>>>>>>> 50e523e (working version) lines.sort(key=lambda x: min(x[0], x[1])) fwcluster = None bwcluster = None @@ -385,7 +443,8 @@ def get_exact_match_length(clusters): elif contigstart < fwcluster[0] + 50 and contigend < fwcluster[1] + 50: fwcluster = (contigstart, contigend, min(fwcluster[2], readstart), max(fwcluster[3], readend), fwcluster[4] + [(real_readstart, real_readend)]) else: - if fwcluster[3] - fwcluster[2] >= readlen * min_read_len_fraction: read_clusters[readname].append((contig, fwcluster[0], fwcluster[1], get_exact_match_length(fwcluster[4]))) + if fwcluster[3] - fwcluster[2] >= readlen * min_read_len_fraction: + read_clusters[readname].append((contig, fwcluster[0], fwcluster[1], get_exact_match_length(fwcluster[4]))) fwcluster = (contigstart, contigend, readstart, readend, [(real_readstart, real_readend)]) else: if bwcluster is None: @@ -393,13 +452,16 @@ def get_exact_match_length(clusters): elif contigstart < bwcluster[0] + 50 and contigend < bwcluster[1] + 50: bwcluster = (contigstart, contigend, min(bwcluster[2], readstart), max(bwcluster[3], readend), bwcluster[4] + [(real_readstart, real_readend)]) else: - if bwcluster[3] - bwcluster[2] >= readlen * min_read_len_fraction: read_clusters[readname].append((contig, bwcluster[1], bwcluster[0], get_exact_match_length(bwcluster[4]))) + if bwcluster[3] - bwcluster[2] >= readlen * min_read_len_fraction: + read_clusters[readname].append((contig, bwcluster[1], bwcluster[0], get_exact_match_length(bwcluster[4]))) bwcluster = (contigstart, contigend, readstart, readend, [(real_readstart, real_readend)]) if fwcluster is not None: - if fwcluster[3] - fwcluster[2] >= readlen * min_read_len_fraction: read_clusters[readname].append((contig, fwcluster[0], fwcluster[1], get_exact_match_length(fwcluster[4]))) + if fwcluster[3] - fwcluster[2] >= readlen * min_read_len_fraction: + read_clusters[readname].append((contig, fwcluster[0], fwcluster[1], get_exact_match_length(fwcluster[4]))) if bwcluster is not None: - if bwcluster[3] - bwcluster[2] >= readlen * min_read_len_fraction: read_clusters[readname].append((contig, bwcluster[1], bwcluster[0], get_exact_match_length(bwcluster[4]))) - + if bwcluster[3] - bwcluster[2] >= readlen * min_read_len_fraction: + read_clusters[readname].append((contig, bwcluster[1], bwcluster[0], get_exact_match_length(bwcluster[4]))) +print (f"Banned {total_banned} allowed {total_notbanned}") contig_actual_lines = {} for readname in read_clusters: longest = []