Skip to content

Commit

Permalink
Adding reference based check
Browse files Browse the repository at this point in the history
  • Loading branch information
Dmitry-Antipov committed Jun 5, 2024
1 parent b78990e commit 59594ec
Show file tree
Hide file tree
Showing 2 changed files with 80 additions and 27 deletions.
14 changes: 7 additions & 7 deletions src/scripts/scaffolding/match_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ def __init__(self, node1, node2, len1, len2):

#orientation defined as orientation of the largest match
self.largest_interval = 0
self.largest_interval_center = [0, 0]
self.orientation = ""

#TODO: should be used for homology check in scaffolding, with specific IDY cutoff and not sorted
Expand All @@ -39,6 +40,7 @@ def addInterval(self, intervals, orientation, idy):
if self.largest_interval < int_len:
self.largest_interval = int_len
self.orientation = orientation
self.largest_interval_center = [(intervals[0][1] + intervals[0][0])/2, (intervals[1][1] + intervals[1][0])/2]

def fillCoverage(self):
for i in range(0, 2):
Expand Down Expand Up @@ -85,6 +87,7 @@ def __init__(self, logger, mashmap_file, min_alignment):
self.addHomology(arr[0], arr[5], int(arr[1]), int(arr[6]), [[int(arr[2]), int(arr[3])], [int(arr[7]), int(arr[8])]], arr[4], arr[12])
self.logger.info(f"Loaded {used_lines} out of {total_lines} mashmap lines")
self.logger.info(f"{len(self.homologies)} nodes have at least one used homology")
self.lens = {}
self.fillCoverage()

#do we want to add other direction of pair?
Expand All @@ -94,7 +97,8 @@ def addHomology(self, node1, node2, len1, len2, intervals, orientation, idy):
if not node2 in self.homologies[node1]:
self.homologies[node1][node2] = HomologyInfo(node1, node2, len1, len2)
self.homologies[node1][node2].addInterval(intervals, orientation, idy)

self.lens[node1] = len1
self.lens[node2] = len2
#covered length of homology
def fillCoverage(self):
for node1 in self.homologies:
Expand All @@ -107,12 +111,8 @@ def isValid(self, node1, node2):
else:
return False
#extracting length, sometimes we do not haev this info in other places
def getLen(self, node1):
if node1 in self.homologies and len (self.homologies[node1]) > 0:
for node2 in self.homologies[node1]:
return self.homologies[node1][node2].len[0]
else:
return 0
def getLength(self, node):
return self.lens(node)

# homology_weight - large negative something, min_big_homology - minimal length of homology to be taken in account.
# Some shorter ones can still sometimes be used if they are in regular bulge_like structure
Expand Down
93 changes: 73 additions & 20 deletions src/scripts/scaffolding/scaffold_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,14 @@
from scaffolding import logger_wrap, match_graph, path_storage

#TODO: or inherit from nx.Digraph??
class ReferencePosition:
def __init__(self, name_q, name_r, average_pos, query_len, orientation):
self.name_q = name_q
self.name_r = name_r
self.average_pos = average_pos
self.query_len = query_len
self.orientation = orientation

class ScaffoldGraph:
LONG_HAPLOID_CUTOFF = 5000000
#should not be actuallly used, since they'll be reordered with MAX_REORDERING_LENGTH check
Expand All @@ -32,25 +40,29 @@ class ScaffoldGraph:
#Distance is defined with respect to homologous paths to allow "gap jumping"
#if one of orientation is relatively close in graph(<min(1/4*path_length, CLOSE_IN_GRAPH) and other is really far (>3/4 of the length), we move all connections from far one to close
CLOSE_IN_GRAPH = 500000

#If paths are closer than CLOSE_IN_GRAPH, we significantly increase scores. Should be reconsidered when lots of gaps present
#can be asymetric because of the 1/4 path_length rule, possibly should reconsider it
CONNECTIVITY_MULTIPLICATIVE_BONUS = 2



#default values for MatchGraph construction
MATCHGRAPH_LONG_NODE = 500000
MATCHGRAPH_MIN_ALIGNMENT = 100000


#reference-based/haplotype-reference based params
#shorter homologies will be ignored, possibly can be tuned in mashmap options?
MIN_HOMOLOGY_REF = 100000
#Quite conservative here, best homology contig should be twice longer than second best
RATIO_HOMOLOGY_REF = 2.0

#If best is not covered by homologous intervals good enough - ignore TODO not sure whether we need it
#If best alignment is not covered by homologous intervals good enough - ignore
#TODO not sure whether we need it
LEN_FRAC_REF = 0.8
#Giving ref bonus for neighboring paths if dist is smaller than
ABSOLUTE_ALLOWED_REFERENCE_JUMP = 2000000
#Consequtive paths scores are increased by this factor.
#TODO Possibly should be some higher absolute constant?
REFERENCE_MULTIPLICATIVE_BONUS = 4



Expand Down Expand Up @@ -81,8 +93,10 @@ def __init__(self, rukki_paths, telomere_locations_file, hic_alignment_file, mat
for node in self.upd_G.nodes:
self.compressed_lens[node] = self.upd_G.nodes[node]['length']
self.compressed_lens[node.strip("-+")] = self.upd_G.nodes[node]['length']


#positions on refence or other haplotype
self.reference_positions = {}
self.assigned_reference = {}
self.G = G
self.dists = dict(nx.all_pairs_dijkstra_path_length(self.upd_G, weight=lambda u, v, d: self.upd_G.edges[u, v]['mid_length']))
self.logger.info("Pairwise distances in assembly graph calculated")
Expand Down Expand Up @@ -181,11 +195,12 @@ def getHaploidPaths(self):
path_len = self.rukki_paths.getLength(nor_path_id)
if total_hom * 2 < path_len:
haploids.add(nor_path_id)
#DEBUG ONLY
if path_len > 2000000:
self.logger.info(f"Found haploid path {nor_path_id} with homology {total_hom} and len {path_len} ")
return haploids


def getClosestTelomere(self, path, direction):
#From telomere to rc(path_end)
if direction == '+':
Expand Down Expand Up @@ -239,21 +254,53 @@ def pathDist(self, path_from, path_to, check_homologous):

def getPathPositions(self, path_mashmap):
hom_storage = match_graph.HomologyStorage(path_mashmap, ScaffoldGraph.MIN_HOMOLOGY_REF)
for node1 in hom_storage.homologies:
best_nodes = []
for node2 in hom_storage.homologies[node1]:
best_nodes.append([hom_storage[node1][node2].getCoveredLen(node1, node2), node2])
best_nodes.sort(reverse=True)
if len(best_nodes) == 1 or best_nodes[0][0] > ScaffoldGraph.RATIO_HOMOLOGY_REF * best_nodes[1][0]:
best_node = best_nodes[0][1]
self.logger.debug(f"Clearly best homology for {node1} is {best_node} with size{best_nodes[0][0]}")
if best_nodes[0][0] > ScaffoldGraph.LEN_FRAC_REF * self.hom_storage.getLength(node1):
self.logger.debug(f"Best homology for {node1} is {best_node} with size{best_nodes[0][0]}")

#TODO:
for path_id in hom_storage.homologies:
all_refs = []
for ref_id in hom_storage.homologies[path_id]:
all_refs.append([hom_storage[path_id][ref_id].getCoveredLen(path_id, ref_id), ref_id])
all_refs.sort(reverse=True)
if len(all_refs) == 1 or all_refs[0][0] > ScaffoldGraph.RATIO_HOMOLOGY_REF * all_refs[1][0]:
best_ref = all_refs[0][1]
self.logger.debug(f"Clearly best homology for {path_id} is {best_ref} with size{all_refs[0][0]}")
if all_refs[0][0] > ScaffoldGraph.LEN_FRAC_REF * self.hom_storage.getLength(path_id):
self.logger.debug(f"Best homology for {path_id} is {best_ref} with size{all_refs[0][0]}")
if not best_ref in self.reference_positions:
self.reference_positions[best_ref] = []
hom_info = hom_storage[path_id][best_ref]
self.assigned_reference[path_id + '+'] = best_ref + hom_info.orientaion
self.assigned_reference[path_id + '-'] = best_ref + self.rc_orientation(hom_info.orientaion)
# self.assigned_reference[path_id] = best_ref
# self.reference_positions[best_ref].append(ReferencePosition(path_id, best_ref, hom_info.largest_interval_center[1], self.hom_storage.getLength(path_id), hom_info.orientaion))
self.reference_positions[best_ref + "+"].append(ReferencePosition(path_id + hom_info.orientaion, best_ref + "+", hom_info.largest_interval_center[1], self.hom_storage.getLength(path_id), hom_info.orientaion))
self.reference_positions[best_ref + "-"].append(ReferencePosition(path_id + self.rc_orientation(hom_info.orientation), best_ref + '-', self.hom_storage.getLength(best_ref) - hom_info.largest_interval_center[1], self.hom_storage.getLength(path_id), self.rc_orientation(hom_info.orientaion)))

else:
self.logger.debug(f"Best homology for {node1} is {best_node} not covered enough frac {best_nodes[0][0] / self.hom_storage.getLength(node1)}")

self.logger.debug(f"Best homology for {path_id} is {best_ref} not covered enough frac {all_refs[0][0] / self.hom_storage.getLength(path_id)}")
for ref in self.reference_positions:
self.logger.debug(f"Reference positions for {ref}")
self.reference_positions[ref] = sorted(self.reference_positions[ref], key=lambda x: x.average_pos)
for pos in self.reference_positions[ref]:
self.logger.debug(f"{pos.name_q} {pos.name_r} {pos.average_pos} {pos.query_len} {pos.orientation}")


def isNextByRef(self, from_path_id, to_path_id):
nor_from_id = from_path_id.strip('-+')
nor_to_id = to_path_id.strip('-+')
if (not (nor_from_id in self.assigned_reference) or not (nor_to_id in self.assigned_reference)):
return False
if self.assigned_reference[nor_from_id] != self.assigned_reference[nor_to_id]:
return False
aligned = self.reference_positions[self.assigned_reference[nor_from_id]]
len_aligned = len(aligned)
for i in range(0, len_aligned - 1):
if aligned[i].name_q == from_path_id and aligned[i+1].name_q == to_path_id:
ref_jump = aligned[i+1].average_pos - aligned[i].average_pos
query_jump = (aligned[i+1].query_len + aligned[i].query_len) / 2
if abs(ref_jump - query_jump) < ScaffoldGraph.ABSOLUTE_ALLOWED_REFERENCE_JUMP:
return True
else:
return False
return False

def forbiddenPair(self, from_path_id, to_path_id):
nor_from_path_id = from_path_id.strip('-+')
Expand Down Expand Up @@ -640,6 +687,12 @@ def getPathPairConnections(self, path_ids, connections, lens):
scores[key] += scores_pair[key]
self.logger.debug (f"Pathscores for {path_ids} {scores}")
scores = self.fixOrientation(path_ids, scores)
for orientation in scores:
if len(orientation) == 2:
if self.isNextByRef(path_ids[0] + orientation[0], path_ids[1] + orientation[1]):
self.logger.debug (f"Reference connection found! {path_ids} {orientation}")
scores[orientation] *= self.REFERENCE_MULTIPLICATIVE_BONUS
scores[orientation] = scores[orientation] / (len(connections[(path_ids[0], path_ids[1])]) + 1)
return scores

#returns dict, {id:[present_start_relo, present_end_telo]}
Expand Down

0 comments on commit 59594ec

Please sign in to comment.