perf: optimise solver graph evaluation loop (#6728)

This fix makes the `loop` method in the solver much more efficient for large graphs (lots of actions and dependencies). While graph evaluation in the solver isn't usually the performance bottleneck, the performance impact can be noticeable in larger projects. Here, we use a generator to get leaf nodes ready for processing (whereas previously, we'd traverse the whole pending graph on each loop iteration and load it node-by-node into a new dependency graph data structure, which was costly when iterating over a large graph in a tight loop). We also used a simple boolean flag (dirty) in the solver to track whether any tasks had been requested or completed (successfully or with an error) since the last loop. This was used to avoid evaluating the graph when we know it's not going to result in any changes to the in-progress or pending graphs, or in any new nodes being scheduled for processing. There are further optimizations we can make to the solver (see the comments added in this commit), but they would require careful implementation and testing to avoid possible regressions, so we're leaving them be for now. In a test project that was specifically designed to get the solver to struggle (lots of actions with lots of dependencies), fully resolving the graph before this change took 3 minutes on my laptop, whereas it's down to 55 seconds after these changes. I suspect it could be brought down to 20-30 seconds with the further optimizations outlined in the added code comments, but this might be enough for the time being. Co-authored-by: Steffen Neubauer <[email protected]>
garden-io · Dec 18, 2024 · bd6c6ba · bd6c6ba
1 parent f014cb6
commit bd6c6ba
Show file tree

Hide file tree

Showing 2 changed files with 123 additions and 592 deletions.
diff --git a/core/src/graph/solver.ts b/core/src/graph/solver.ts
@@ -11,10 +11,9 @@ import type { Log } from "../logger/log-entry.js"
 import type { GardenError, GardenErrorParams } from "../exceptions.js"
 import { GraphError, toGardenError } from "../exceptions.js"
 import { uuidv4 } from "../util/random.js"
-import { DependencyGraph } from "./common.js"
 import { Profile } from "../util/profiling.js"
 import { TypedEventEmitter } from "../util/events.js"
-import { groupBy, keyBy } from "lodash-es"
+import { keyBy } from "lodash-es"
 import type { GraphResult, TaskEventBase } from "./results.js"
 import { GraphResults, resultToString } from "./results.js"
 import { gardenEnv } from "../constants.js"
@@ -53,6 +52,7 @@ export class GraphSolver extends TypedEventEmitter<SolverEvents> {
   // Tasks currently running
   private readonly inProgress: WrappedNodes
 
+  private dirty: boolean
   private inLoop: boolean
 
   private log: Log
@@ -68,19 +68,20 @@ export class GraphSolver extends TypedEventEmitter<SolverEvents> {
 
     this.log = garden.log.createLog({ name: "graph-solver" })
     this.inLoop = false
+    this.dirty = false
     this.requestedTasks = {}
     this.nodes = {}
     this.pendingNodes = {}
     this.inProgress = {}
     this.lock = new AsyncLock()
 
     this.on("start", () => {
-      this.log.silly(`GraphSolver: start`)
+      this.log.silly(() => `GraphSolver: start`)
       this.emit("loop", {})
     })
 
     this.on("loop", () => {
-      this.log.silly(`GraphSolver: loop`)
+      this.log.silly(() => `GraphSolver: loop`)
       this.loop()
     })
   }
@@ -214,31 +215,34 @@ export class GraphSolver extends TypedEventEmitter<SolverEvents> {
     })
   }
 
-  private getPendingGraph() {
+  private *getPendingLeaves(): Generator<TaskNode, undefined, undefined> {
     const nodes = Object.values(this.pendingNodes)
-    const graph = new DependencyGraph<TaskNode>()
+    const visitedKeys = new Set<string>()
 
-    const addNode = (node: TaskNode) => {
+    function* addNode(node: TaskNode) {
       const key = node.getKey()
 
-      if (node.isComplete()) {
+      if (node.isComplete() || visitedKeys.has(key)) {
         return
       }
-      graph.addNode(key, node)
+      visitedKeys.add(key)
 
+      // TODO: We could optimize further by making this method a generator too.
       const deps = node.getRemainingDependencies()
+      if (deps.length === 0) {
+        // Leaf node found
+        yield node
+        return
+      }
 
       for (const dep of deps) {
-        addNode(dep)
-        graph.addDependency(key, dep.getKey())
+        yield* addNode(dep)
       }
     }
 
     for (const node of nodes) {
-      addNode(node)
+      yield* addNode(node)
     }
-
-    return graph
   }
 
   start() {
@@ -286,6 +290,12 @@ export class GraphSolver extends TypedEventEmitter<SolverEvents> {
     if (this.inLoop) {
       return
     }
+    if (!this.dirty) {
+      // The graph becomes dirty when a task is requested or completed: This means that either a new task node
+      // may have been added, or that a new result has been set on a node (which will affect the next graph
+      // evaluation).
+      return
+    }
     this.inLoop = true
 
     try {
@@ -299,36 +309,56 @@ export class GraphSolver extends TypedEventEmitter<SolverEvents> {
         }
       }
 
-      const graph = this.getPendingGraph()
+      const leafGenerator = this.getPendingLeaves()
+      let leafCount = 0
 
-      if (graph.size() === 0) {
-        return
-      }
+      const inProgressNodes = Object.values(this.inProgress)
 
-      const leaves = graph.overallOrder(true)
-      const pending = leaves.map((key) => this.nodes[key])
+      // Enforce concurrency limits per task type and concurrency group key
+      const pendingConcurrencyGroupCapacitites: { [key: string]: number } = {}
 
-      const inProgressNodes = Object.values(this.inProgress)
-      const inProgressByGroup = groupBy(inProgressNodes, "type")
-
-      // Enforce concurrency limits per task type
-      const grouped = groupBy(pending, (n) => n.concurrencyGroupKey)
-      const limitedByGroup = Object.values(grouped).flatMap((nodes) => {
-        // Note: We can be sure there is at least one node in the array
-        const groupLimit = nodes[0].concurrencyLimit
-        const inProgress = inProgressByGroup[nodes[0].type] || []
-        return nodes.slice(0, groupLimit - inProgress.length)
-      })
+      this.dirty = false
+
+      // We could do this with a `groupBy`, but this is more efficient (and the loop method is run frequently).
+      for (const node of inProgressNodes) {
+        const groupKey = node.concurrencyGroupKey
+        if (!pendingConcurrencyGroupCapacitites[groupKey]) {
+          pendingConcurrencyGroupCapacitites[groupKey] = 0
+        }
+        pendingConcurrencyGroupCapacitites[groupKey]++
+      }
+      const leavesLimitedByGroup: TaskNode[] = []
+      for (const node of leafGenerator) {
+        if (leafCount >= this.hardConcurrencyLimit - inProgressNodes.length) {
+          // Enforce hard global limit. Note that we never get more leaves than this from `leafGenerator`, which can
+          // save on compute in big graphs.
+          break
+        }
+        leafCount++
+        const groupKey = node.concurrencyGroupKey
+        // Note: All nodes within a given concurrency group should have the same limit.
+        const groupLimit = node.concurrencyLimit
+        if (!pendingConcurrencyGroupCapacitites[groupKey]) {
+          pendingConcurrencyGroupCapacitites[groupKey] = 0
+        }
+        if (pendingConcurrencyGroupCapacitites[groupKey] >= groupLimit) {
+          // We've already reached the concurrency limit for this group, so we won't schedule this node now.
+          continue
+        }
+        // There's capacity available for this group, so we schedule the node
+        leavesLimitedByGroup.push(node)
+        pendingConcurrencyGroupCapacitites[groupKey]++
+      }
+      if (leafCount === 0) {
+        return
+      }
 
-      if (limitedByGroup.length === 0) {
+      if (leavesLimitedByGroup.length === 0) {
         this.emit("loop", {})
         return
       }
 
-      // Enforce hard global limit
-      const nodesToProcess = limitedByGroup
-        .slice(0, this.hardConcurrencyLimit - inProgressNodes.length)
-        .filter((node) => !this.inProgress[node.getKey()])
+      const nodesToProcess = leavesLimitedByGroup.filter((node) => !this.inProgress[node.getKey()])
 
       if (nodesToProcess.length === 0) {
         this.emit("loop", {})
@@ -370,6 +400,7 @@ export class GraphSolver extends TypedEventEmitter<SolverEvents> {
    * Processes a single task to completion, handling errors and providing its result to in-progress task batches.
    */
   private async processNode(node: TaskNode, startedAt: Date) {
+    this.dirty = true
     // Check for missing dependencies by calculating the input version so we can handle the exception
     // as a user error before getting deeper into the control flow (where it would result in an internal
     // error with a noisy stack trace).
@@ -381,6 +412,8 @@ export class GraphSolver extends TypedEventEmitter<SolverEvents> {
     }
 
     this.log.silly(() => `Processing node ${taskStyle(node.getKey())}`)
+    // TODO-performance: Record that a result or an error has become available for this node, use for looping
+    // in evaluateRequests.
 
     try {
       const processResult = await node.execute()
@@ -394,6 +427,7 @@ export class GraphSolver extends TypedEventEmitter<SolverEvents> {
   }
 
   private requestTask(params: TaskRequestParams) {
+    this.dirty = true
     const request = new RequestTaskNode(params)
     if (!this.requestedTasks[params.batchId]) {
       this.requestedTasks[params.batchId] = {}
@@ -403,6 +437,7 @@ export class GraphSolver extends TypedEventEmitter<SolverEvents> {
   }
 
   private evaluateRequests() {
+    // TODO-performance: Only iterate over requests with new results since last loop
     for (const [_batchId, requests] of Object.entries(this.requestedTasks)) {
       for (const request of Object.values(requests)) {
         if (request.isComplete()) {
@@ -430,6 +465,8 @@ export class GraphSolver extends TypedEventEmitter<SolverEvents> {
           this.log.silly(() => `Request ${request.getKey()} has ready status and force=false, no need to process.`)
           this.completeTask({ ...status, node: request })
         } else {
+          // TODO-performance: Add processing nodes for requests only once, during the solve call before looping.
+          // We create exactly one request node for each requested task, so this is known up front.
           const processNode = this.getNode({ type: "process", task, statusOnly: request.statusOnly })
           const result = this.getPendingResult(processNode)
 
@@ -452,6 +489,7 @@ export class GraphSolver extends TypedEventEmitter<SolverEvents> {
   }
 
   private ensurePendingNode(node: TaskNode, dependant: TaskNode) {
+    this.dirty = true
     const key = node.getKey()
     const existing = this.pendingNodes[key]
 
@@ -465,6 +503,7 @@ export class GraphSolver extends TypedEventEmitter<SolverEvents> {
   }
 
   private completeTask(params: CompleteTaskParams & { node: TaskNode }) {
+    this.dirty = true
     const node = params.node
     const result = node.complete(params)
     delete this.inProgress[node.getKey()]