diff --git a/src/scripts/cluster.py b/src/scripts/cluster.py index 6788abe5..2bf93ecc 100755 --- a/src/scripts/cluster.py +++ b/src/scripts/cluster.py @@ -178,7 +178,8 @@ def run_clustering (graph_gfa, mashmap_sim, hic_byread, output_dir, no_rdna, une MAX_COV = 100 # tempora# ry coverage cutoff, currently replaced by median coverage from gfa FIXED_WEIGHT = 100000 # best result so far with 100000 #currently replaced with max pairwise weight among datasets - + MAX_RDNA_COMPONENT = 10000000 # maximal size of rDNA component, used for filtering out rDNA cluster only + MIN_RDNA_COMPONENT = 500000 # load the assembly gfa G = nx.Graph() logging_f = open (os.path.join(output_dir, LOGGING_FILENAME), 'w') @@ -202,9 +203,9 @@ def run_clustering (graph_gfa, mashmap_sim, hic_byread, output_dir, no_rdna, une if not (no_rdna): #Store rDNA component, not to add additional links from matchGraph - sys.stderr.write(f"Found an rDNA huge component of {len(largest_component)} edges\n") + #sys.stderr.write(f"Found an rDNA huge component of {len(largest_component)} edges\n") #Here we remove large connected components of short edge, just to exclude rDNA cluster - delete_set = graph_functions.remove_large_tangles(G, MAX_SHORT_LEN, MAX_SHORT_COMPONENT) + delete_set = graph_functions.remove_large_tangles(G, MAX_SHORT_LEN, MAX_SHORT_COMPONENT,MIN_RDNA_COMPONENT, MAX_RDNA_COMPONENT) else: sys.stderr.write(f"Not using rDNA component removal heuristics\n") filtered_graph = open(os.path.join(output_dir, FILTERED_GFA_FILENAME), 'w') diff --git a/src/scripts/graph_functions.py b/src/scripts/graph_functions.py index 8380ce68..ce5b4d37 100755 --- a/src/scripts/graph_functions.py +++ b/src/scripts/graph_functions.py @@ -1,8 +1,17 @@ import networkx as nx import sys + +def get_component_length(G, component): + res = 0 + for node in component: + res += G.nodes[node]['length'] + return res + #Functions for indirect graph processing, nodes - "utig1-234" -def remove_large_tangles(G, MAX_LEN, MAX_SHORT_COMPONENT): + +#TODO: here should be a better check, with either rDNA nodes or diploid structure. After scaff refactor +def remove_large_tangles(G, MAX_LEN, MAX_SHORT_COMPONENT, MIN_RDNA_COMPONENT, MAX_RDNA_COMPONENT): shorts = set() for node in G.nodes(): if G.nodes[node]['length'] < MAX_LEN: @@ -11,12 +20,36 @@ def remove_large_tangles(G, MAX_LEN, MAX_SHORT_COMPONENT): nodes_deleted = 0 components_deleted = 0 to_delete = [] - for comp in nx.connected_components(sh_G): + #largest tangle will be always removed (unless --no-rdna-tangle used which completely disable tangle removal heuristics) + First = True + + for comp in sorted(nx.connected_components(sh_G), key=len, reverse=True): if len(comp) > MAX_SHORT_COMPONENT: - components_deleted += 1 - for e in comp: - nodes_deleted += 1 - to_delete.append(e) +#lets check whether component looks similar to rdna + first_edge = list(comp)[0] + sys.stderr.write(f'Checking component with first edge {first_edge}\n') + first_edge_big_comp = nx.node_connected_component(G, first_edge) + not_deleting = set( ) + for e in first_edge_big_comp: + if not e in comp: + not_deleting.add(e) + subgraph_comp = G.subgraph(not_deleting) + new_connected = nx.connected_components(subgraph_comp) + good_new_connected = False + for new_comp in new_connected: + tlen = get_component_length(G, new_comp) + if tlen < MAX_RDNA_COMPONENT and tlen > MIN_RDNA_COMPONENT: + good_new_connected = True + sys.stderr.write(f'Component size {tlen}, small enough\n') + break + if First or good_new_connected: + First = False + components_deleted += 1 + for e in comp: + nodes_deleted += 1 + to_delete.append(e) + else: + sys.stderr.write(f'Not deleting component {comp}, nothing similar to rDNA\n') G.remove_nodes_from(to_delete) sys.stderr.write(f'Removed {components_deleted} short nodes components and {nodes_deleted} short nodes. New ' f'number of nodes {G.number_of_nodes()}\n')