Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

tangle removal heuristics quick fix #251

Merged
merged 1 commit into from
May 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions src/scripts/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,8 @@ def run_clustering (graph_gfa, mashmap_sim, hic_byread, output_dir, no_rdna, une
MAX_COV = 100 # tempora# ry coverage cutoff, currently replaced by median coverage from gfa
FIXED_WEIGHT = 100000 # best result so far with 100000 #currently replaced with max pairwise weight among datasets


MAX_RDNA_COMPONENT = 10000000 # maximal size of rDNA component, used for filtering out rDNA cluster only
MIN_RDNA_COMPONENT = 500000
# load the assembly gfa
G = nx.Graph()
logging_f = open (os.path.join(output_dir, LOGGING_FILENAME), 'w')
Expand All @@ -202,9 +203,9 @@ def run_clustering (graph_gfa, mashmap_sim, hic_byread, output_dir, no_rdna, une

if not (no_rdna):
#Store rDNA component, not to add additional links from matchGraph
sys.stderr.write(f"Found an rDNA huge component of {len(largest_component)} edges\n")
#sys.stderr.write(f"Found an rDNA huge component of {len(largest_component)} edges\n")
#Here we remove large connected components of short edge, just to exclude rDNA cluster
delete_set = graph_functions.remove_large_tangles(G, MAX_SHORT_LEN, MAX_SHORT_COMPONENT)
delete_set = graph_functions.remove_large_tangles(G, MAX_SHORT_LEN, MAX_SHORT_COMPONENT,MIN_RDNA_COMPONENT, MAX_RDNA_COMPONENT)
else:
sys.stderr.write(f"Not using rDNA component removal heuristics\n")
filtered_graph = open(os.path.join(output_dir, FILTERED_GFA_FILENAME), 'w')
Expand Down
45 changes: 39 additions & 6 deletions src/scripts/graph_functions.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,17 @@
import networkx as nx
import sys


def get_component_length(G, component):
res = 0
for node in component:
res += G.nodes[node]['length']
return res

#Functions for indirect graph processing, nodes - "utig1-234"
def remove_large_tangles(G, MAX_LEN, MAX_SHORT_COMPONENT):

#TODO: here should be a better check, with either rDNA nodes or diploid structure. After scaff refactor
def remove_large_tangles(G, MAX_LEN, MAX_SHORT_COMPONENT, MIN_RDNA_COMPONENT, MAX_RDNA_COMPONENT):
shorts = set()
for node in G.nodes():
if G.nodes[node]['length'] < MAX_LEN:
Expand All @@ -11,12 +20,36 @@ def remove_large_tangles(G, MAX_LEN, MAX_SHORT_COMPONENT):
nodes_deleted = 0
components_deleted = 0
to_delete = []
for comp in nx.connected_components(sh_G):
#largest tangle will be always removed (unless --no-rdna-tangle used which completely disable tangle removal heuristics)
First = True

for comp in sorted(nx.connected_components(sh_G), key=len, reverse=True):
if len(comp) > MAX_SHORT_COMPONENT:
components_deleted += 1
for e in comp:
nodes_deleted += 1
to_delete.append(e)
#lets check whether component looks similar to rdna
first_edge = list(comp)[0]
sys.stderr.write(f'Checking component with first edge {first_edge}\n')
first_edge_big_comp = nx.node_connected_component(G, first_edge)
not_deleting = set( )
for e in first_edge_big_comp:
if not e in comp:
not_deleting.add(e)
subgraph_comp = G.subgraph(not_deleting)
new_connected = nx.connected_components(subgraph_comp)
good_new_connected = False
for new_comp in new_connected:
tlen = get_component_length(G, new_comp)
if tlen < MAX_RDNA_COMPONENT and tlen > MIN_RDNA_COMPONENT:
good_new_connected = True
sys.stderr.write(f'Component size {tlen}, small enough\n')
break
if First or good_new_connected:
First = False
components_deleted += 1
for e in comp:
nodes_deleted += 1
to_delete.append(e)
else:
sys.stderr.write(f'Not deleting component {comp}, nothing similar to rDNA\n')
G.remove_nodes_from(to_delete)
sys.stderr.write(f'Removed {components_deleted} short nodes components and {nodes_deleted} short nodes. New '
f'number of nodes {G.number_of_nodes()}\n')
Expand Down
Loading