Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow Redundant Paths to Consider Indels #19

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 14 additions & 32 deletions src/Source/Add-in/Bio.Padena/PathWithOrientation.cs
Original file line number Diff line number Diff line change
Expand Up @@ -8,20 +8,25 @@ namespace Bio.Algorithms.Assembly.Padena
/// Structure that stores list of nodes in path,
/// along with path orientation.
/// </summary>
public class PathWithOrientation
internal class PathWithOrientation
{
/// <summary>
/// Flag to indicate if this path is "Active" or still being extended.
/// </summary>
internal bool EndReached;

/// <summary>
/// List of nodes in path.
/// </summary>
private List<DeBruijnNode> nodes;
internal List<DeBruijnNode> Nodes;

/// <summary>
/// Initializes a new instance of the PathWithOrientation class.
/// </summary>
/// <param name="node1">First node to add.</param>
/// <param name="node2">Second node to add.</param>
/// <param name="orientation">Path orientation.</param>
public PathWithOrientation(DeBruijnNode node1, DeBruijnNode node2, bool orientation)
internal PathWithOrientation(DeBruijnNode node1, DeBruijnNode node2, bool orientation)
{
if (node1 == null)
{
Expand All @@ -33,38 +38,15 @@ public PathWithOrientation(DeBruijnNode node1, DeBruijnNode node2, bool orientat
throw new ArgumentNullException("node2");
}

this.nodes = new List<DeBruijnNode> { node1, node2 };
this.IsSameOrientation = orientation;
}

/// <summary>
/// Initializes a new instance of the PathWithOrientation class.
/// Copies the input path info to a new one.
/// </summary>
/// <param name="other">Path info to copy.</param>
public PathWithOrientation(PathWithOrientation other)
{
if (other == null)
{
throw new ArgumentNullException("other");
}

this.nodes = new List<DeBruijnNode>(other.Nodes);
this.IsSameOrientation = other.IsSameOrientation;
}

/// <summary>
/// Gets the list of nodes in path.
/// </summary>
public IList<DeBruijnNode> Nodes
{
get { return this.nodes; }
this.Nodes = new List<DeBruijnNode> { node1, node2 };
this.GrabNextNodesOnLeft = orientation;
this.EndReached = false;
}

/// <summary>
/// Gets or sets a value indicating whether path orientation is same or opposite
/// with respect to the start node of the path.
/// Indicates if at the end of the path, the next nodes should come from the
/// left or right extensions
/// </summary>
public bool IsSameOrientation { get; set; }
internal bool GrabNextNodesOnLeft;
}
}
199 changes: 133 additions & 66 deletions src/Source/Add-in/Bio.Padena/RedundantPathsPurger.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
using System.Collections.Generic;

using Bio.Algorithms.Assembly.Graph;

namespace Bio.Algorithms.Assembly.Padena
Expand Down Expand Up @@ -52,6 +54,9 @@ public string Description

/// <summary>
/// Gets or sets threshold for length of redundant paths.
///
/// Given two diverging paths leaving a node, we extend the paths up to a maximum length of LengthThreshold,
/// looking for them to converge with each other before giving up.
/// </summary>
public int LengthThreshold
{
Expand All @@ -74,15 +79,12 @@ public int LengthThreshold
/// <returns>List of path nodes to be deleted.</returns>
public DeBruijnPathList DetectErroneousNodes(DeBruijnGraph deBruijnGraph)
{
if (deBruijnGraph == null)
{
throw new ArgumentNullException("deBruijnGraph");
}

DeBruijnGraph.ValidateGraph(deBruijnGraph);
this.graph = deBruijnGraph;

// List of the collection of redundant paths, passed in to method to be filled
// TODO: Paths are traversed and returned in both directions here: we should probably simplify...
List<DeBruijnPathList> redundantPaths = new List<DeBruijnPathList>();

Parallel.ForEach(
deBruijnGraph.GetNodes(),
node =>
Expand All @@ -98,7 +100,7 @@ public DeBruijnPathList DetectErroneousNodes(DeBruijnGraph deBruijnGraph)
TraceDivergingExtensionPaths(node, node.GetLeftExtensionNodesWithOrientation(), false, redundantPaths);
}
});

// Now to check that for each path they all go in the same way.
redundantPaths = RemoveDuplicates(redundantPaths);
return DetachBestPath(redundantPaths);
}
Expand All @@ -110,10 +112,6 @@ public DeBruijnPathList DetectErroneousNodes(DeBruijnGraph deBruijnGraph)
/// <param name="nodesList">Path nodes to be deleted.</param>
public void RemoveErroneousNodes(DeBruijnGraph deBruijnGraph, DeBruijnPathList nodesList)
{
if (this.graph == null)
{
throw new ArgumentNullException("deBruijnGraph");
}

DeBruijnGraph.ValidateGraph(deBruijnGraph);

Expand All @@ -131,6 +129,7 @@ public void RemoveErroneousNodes(DeBruijnGraph deBruijnGraph, DeBruijnPathList n
// Update extensions for deletion
// No need for read-write lock as deleteNode's dictionary is being read,
// and only other graph node's dictionaries are updated.
//TODO: Every link is connected to another, we should only remove the deleted nodes, not iterate over all nodes.
Parallel.ForEach(
deleteNodes,
node =>
Expand Down Expand Up @@ -176,33 +175,36 @@ private static DeBruijnPathList ExtractBestPath(DeBruijnPathList divergingPaths)
DeBruijnPath bestPath = divergingPaths.Paths[bestPathIndex];
divergingPaths.Paths.RemoveAt(bestPathIndex);

// There can be overlap between redundant paths.
// Remove path nodes that occur in best path
/* There can be overlap between redundant paths and non-redundant paths
* Remove path nodes that occur in best path or that are part of an unrelated path
* e.g. A->B->C->D overlaps with E->F->G->C->D so C should be preserved even
* if not part of the best path. */
foreach (var path in divergingPaths.Paths)
{
path.RemoveAll(n => bestPath.PathNodes.Contains(n));
{
// condition below should include the condition bestPath.PathNodes.Contains(n)
path.RemoveAll(n => n.LeftExtensionNodesCount > 1 || n.RightExtensionNodesCount > 1);
}

return divergingPaths;
}

/// <summary>
/// Gets the best path from the list of diverging paths.
/// Path that has maximum sum of 'count' of belonging k-mers is best.
/// Path that has maximum average of 'count' of belonging k-mers is best.
/// In case there are multiple 'best' paths, we arbitrarily return one of them.
/// </summary>
/// <param name="divergingPaths">List of diverging paths.</param>
/// <returns>Index of the best path.</returns>
private static int GetBestPath(DeBruijnPathList divergingPaths)
{
// We find the index of the 'best' path.
long max = -1;
double max = -1;
int maxIndex = -1;

// Path that has the maximum sum of 'count' of belonging k-mers is the winner
// Path that has the maximum average of 'count' of belonging k-mers is the winner
for (int i = 0; i < divergingPaths.Paths.Count; i++)
{
long sum = divergingPaths.Paths[i].PathNodes.Sum(n => n.KmerCount);
double sum = divergingPaths.Paths[i].PathNodes.Average(n => (double)n.KmerCount);
if (sum > max)
{
max = sum;
Expand Down Expand Up @@ -288,7 +290,7 @@ private static List<DeBruijnPathList> RemoveDuplicates(List<DeBruijnPathList> re
/// </summary>
/// <param name="startNode">Node at starting point of divergence.</param>
/// <param name="divergingNodes">List of diverging nodes.</param>
/// <param name="isForwardExtension">Bool indicating direction of divergence.</param>
/// <param name="isForwardExtension">Bool indicating direction of divergence. (Right = true)</param>
/// <param name="redundantPaths">List of redundant paths.</param>
private void TraceDivergingExtensionPaths(
DeBruijnNode startNode,
Expand All @@ -298,67 +300,132 @@ private void TraceDivergingExtensionPaths(
{
List<PathWithOrientation> divergingPaths = new List<PathWithOrientation>(
divergingNodes.Select(n =>
new PathWithOrientation(startNode, n.Key, n.Value)));
int divergingPathLengh = 2;

new PathWithOrientation(startNode, n.Key, (isForwardExtension ^ n.Value))));
int divergingPathLength = 2;

/* These are nodes with >= 2 extensions coming in from
* the same direction as a path we are following. If two paths
* both enter the same node from the same direction, they can be redundant.
*/
HashSet<DeBruijnNode> possibleEndNodes = new HashSet<DeBruijnNode>();
int finishedCount = 0;
// Extend each path in cluster. While performing path extension
// also keep track of whether they have converged, which we indicate by setting
// this to the first node that two paths both encounter.
DeBruijnNode convergentNode = null;
// Extend paths till length threshold is exceeded.
// In case paths converge within threshold, we break out of while.
while (divergingPathLengh <= this.pathLengthThreshold)
{
// Extension is possible only if end point of all paths has exactly one extension
// In case extensions count is 0, no extensions possible for some path (or)
// if extensions is more than 1, they are diverging further. Not considered a redundant path
if (divergingPaths.Any(p => ((isForwardExtension ^ p.IsSameOrientation) ?
p.Nodes.Last().LeftExtensionNodesCount : p.Nodes.Last().RightExtensionNodesCount) != 1))
{
return;
}

// Extend each path in cluster. While performing path extension
// also keep track of whether they have converged
bool hasConverged = true;
foreach (PathWithOrientation path in divergingPaths)
{
// or possible paths are exhausted
while (divergingPathLength <= this.pathLengthThreshold &&
finishedCount != divergingPaths.Count &&
convergentNode == null)
{
foreach(PathWithOrientation path in divergingPaths) {
if (path.EndReached) {
continue;
}
DeBruijnNode endNode = path.Nodes.Last();
Dictionary<DeBruijnNode, bool> extensions
= (isForwardExtension ^ path.IsSameOrientation) ? endNode.GetLeftExtensionNodesWithOrientation() : endNode.GetRightExtensionNodesWithOrientation();

KeyValuePair<DeBruijnNode, bool> nextNode = extensions.First();
if (path.Nodes.Contains(nextNode.Key))
{
// Loop in path
return;
}
else
{
// Update path orientation
path.IsSameOrientation = !(path.IsSameOrientation ^ nextNode.Value);
path.Nodes.Add(nextNode.Key);

// Check if paths so far are converged
if (hasConverged && nextNode.Key != divergingPaths.First().Nodes.Last())
{
// Last node added is different. Paths do not converge
hasConverged = false;
= path.GrabNextNodesOnLeft ? endNode.GetLeftExtensionNodesWithOrientation() : endNode.GetRightExtensionNodesWithOrientation();

// Extension is possible only if end point of all paths has exactly one extension
// In case extensions count is 0, no extensions possible for some path (or)
// if extensions is more than 1, they are diverging further. Not considered a redundant path
if (extensions.Count > 1 || extensions.Count == 0) {
path.EndReached = true;
finishedCount++;
} else {
// Get next node
KeyValuePair<DeBruijnNode, bool> nextNodeTuple = extensions.First ();
DeBruijnNode nextNode = nextNodeTuple.Key;
// Have we formed a circle? If so, we are done.
// TODO: This is almost certainly very slow for long paths, can replace with Hash and remove possibleEndNodes variable
if (path.Nodes.Contains (nextNode)) {
finishedCount++;
path.EndReached = true;
} else {
// Update path orientation
path.GrabNextNodesOnLeft = !(path.GrabNextNodesOnLeft ^ nextNodeTuple.Value);
path.Nodes.Add (nextNode);

/* Did any other nodes come in to this node from the same direction
* (path or N-1 basepairs shared)? */
var sameInputsCount = path.GrabNextNodesOnLeft ? nextNode.RightExtensionNodesCount : nextNode.LeftExtensionNodesCount;
if (sameInputsCount > 1) {
if (possibleEndNodes.Contains (nextNode)) {
path.EndReached = true;
convergentNode = nextNode;
finishedCount++;
} else {
possibleEndNodes.Add (nextNode);
}
}
}
}
}

divergingPathLengh++;

divergingPathLength++;
// Paths have been extended. Check for convergence
if (hasConverged)
if (convergentNode != null)
{
bool redundantPathFound = ConfirmAndAddRedundantPaths (convergentNode, divergingPaths, redundantPaths);
if (redundantPathFound) {
return;
} else {
/* If we didn't find any paths, it means the nodes came in from different directions, so we
* didn't find a truly convergent node, and the search continues. This should basically never happen.
*/
convergentNode = null;
}
}
}
}

/// <summary>
/// Called once we have a set of paths where at least two of these paths converge on the same node.
/// This method checks that the paths are truly convergent (converge from the same direction),
/// trims off any excess (indels can lead to unequal paths), and adds them to the redundant path list.
/// </summary>
/// <param name="convergentNode">Convergent node.</param>
/// <param name="divergingPaths">Paths.</param>
/// <param name="redundantPaths">Redundant paths.</param>
private bool ConfirmAndAddRedundantPaths(DeBruijnNode convergentNode, List<PathWithOrientation> divergingPaths,
List<DeBruijnPathList> redundantPaths)
{
bool foundRedundantPaths = false;
/* Now it is possible that two (or more) paths have converged on a node but from
* different directions, so we check for this */

// Get paths that converge on this node
var convergingPaths = divergingPaths.Select (x => new KeyValuePair<PathWithOrientation, int> (x, x.Nodes.IndexOf (convergentNode))).
Where(z => z.Value != -1).ToList ();

// Now trim them all to the appropriate length so convergent node is the end node
// (in case of unequal paths they may differ)
foreach (var pathLocation in convergingPaths) {
var location = pathLocation.Value;
var path = pathLocation.Key;
if (location != path.Nodes.Count - 1) {
path.Nodes.RemoveRange (location + 1, path.Nodes.Count - (location + 1));
}
}

/* Now we have to make a path of all nodes that converge in the same direction */
List<DeBruijnNode>[] sideExtensions = { convergentNode.GetLeftExtensionNodes().ToList(),
convergentNode.GetRightExtensionNodes().ToList()};
foreach (var extensions in sideExtensions) {
var convergeFromSameSide = convergingPaths.Where (p => extensions.Contains (p.Key.Nodes [p.Key.Nodes.Count - 2])).ToList();
if (convergeFromSameSide.Count > 1) {
foundRedundantPaths = true;
// Note: all paths have the same end node.
lock (redundantPaths)
{
// Redundant paths found
redundantPaths.Add(new DeBruijnPathList(divergingPaths.Select(p => new DeBruijnPath(p.Nodes))));
}

return;
}
}
return foundRedundantPaths;
}

}


}
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,7 @@ public void Build(IEnumerable<ISequence> sequences)
DeBruijnNode node = kmerManager.SetNewOrGetOld(newKmer);

// Need to lock node if doing this in parallel
if (node.KmerCount <= 255)
if (node.KmerCount < UInt32.MaxValue)
{
lock (node)
{
Expand Down
Loading