RSDK-9588 - Take reconfigurationLock at a higher level #4645

Merged
7 commits merged on Jan 6, 2025
20 changes: 19 additions & 1 deletion robot/impl/local_robot.go
@@ -58,6 +58,9 @@ type localRobot struct {
cloudConnSvc icloud.ConnectionService
logger logging.Logger
activeBackgroundWorkers sync.WaitGroup

// reconfigurationLock manages access to the resource graph and nodes. If either may change, this lock should be taken.
reconfigurationLock sync.Mutex
// reconfigureWorkers tracks goroutines spawned by reconfiguration functions. we only
// wait on this group in tests to prevent goleak-related failures. however, we do not
// wait on this group outside of testing, since the related goroutines may be running
@@ -172,7 +175,9 @@ func (r *localRobot) Close(ctx context.Context) error {
err = multierr.Combine(err, r.cloudConnSvc.Close(ctx))
}
if r.manager != nil {
r.reconfigurationLock.Lock()
Member Author: We grab the lock here for longer compared to inside resourceManager.Close().

Member: You're just mentioning that as part of the warning in the PR description about "locking for longer/more often than we were locking before"?

Member Author: Yes, this is the biggest change/risk given the amount of extra locking; glad the tests pass.

err = multierr.Combine(err, r.manager.Close(ctx))
r.reconfigurationLock.Unlock()
}
if r.packageManager != nil {
err = multierr.Combine(err, r.packageManager.Close(ctx))
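A note on the risk flagged in the thread above: Go's sync.Mutex is not reentrant, so anything that runs while reconfigurationLock is held must not call back into a method that takes the same lock. The sketch below is a hypothetical illustration of that failure mode, with made-up robot and removeOrphaned names; it is not something this diff does.

```go
package main

import "sync"

type robot struct{ mu sync.Mutex }

// Close holds the lock across a wider region, analogous to localRobot.Close above.
func (r *robot) Close() {
	r.mu.Lock()
	defer r.mu.Unlock()
	r.removeOrphaned() // re-acquires mu on the same goroutine and never returns
}

// removeOrphaned also takes the lock, like the lock-taking methods in this PR.
func (r *robot) removeOrphaned() {
	r.mu.Lock()
	defer r.mu.Unlock()
}

func main() {
	(&robot{}).Close() // fatal error: all goroutines are asleep - deadlock!
}
```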
@@ -307,6 +312,7 @@ func (r *localRobot) completeConfigWorker() {
trigger = "remote"
r.logger.CDebugw(r.closeContext, "configuration attempt triggered by remote")
}
r.reconfigurationLock.Lock()
anyChanges := r.manager.updateRemotesResourceNames(r.closeContext)
if r.manager.anyResourcesNotConfigured() {
anyChanges = true
@@ -316,6 +322,7 @@
r.updateWeakDependents(r.closeContext)
r.logger.CDebugw(r.closeContext, "configuration attempt completed with changes", "trigger", trigger)
}
r.reconfigurationLock.Unlock()
}
}

@@ -440,6 +447,11 @@ func newWithResources(
if err != nil {
return nil, err
}

// now that we're changing the resource graph, take the reconfigurationLock so
// that other goroutines can't interleave
r.reconfigurationLock.Lock()
defer r.reconfigurationLock.Unlock()
if err := r.manager.resources.AddNode(
web.InternalServiceName,
resource.NewConfiguredGraphNode(resource.Config{}, r.webSvc, builtinModel)); err != nil {
@@ -497,7 +509,7 @@
}, r.activeBackgroundWorkers.Done)
}

r.Reconfigure(ctx, cfg)
r.reconfigure(ctx, cfg, false)

for name, res := range resources {
node := resource.NewConfiguredGraphNode(resource.Config{}, res, unknownModel)
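The change from r.Reconfigure to r.reconfigure pairs with the lock taken earlier in newWithResources: the exported Reconfigure (see the hunk further down) now acquires reconfigurationLock itself, so calling it while the constructor already holds that lock would self-deadlock on the non-reentrant mutex. Below is a minimal, self-contained sketch of that exported-locks / unexported-assumes-lock split, using generic names rather than the real robot types.

```go
package main

import (
	"fmt"
	"sync"
)

type robot struct {
	mu    sync.Mutex
	state string
}

// Reconfigure is the public entry point; it owns the lock.
func (r *robot) Reconfigure(s string) {
	r.mu.Lock()
	defer r.mu.Unlock()
	r.reconfigure(s)
}

// reconfigure assumes the caller already holds r.mu.
func (r *robot) reconfigure(s string) {
	r.state = s
}

func newRobot(s string) *robot {
	r := &robot{}
	r.mu.Lock()
	defer r.mu.Unlock()
	r.reconfigure(s) // calling r.Reconfigure here would self-deadlock
	return r
}

func main() {
	fmt.Println(newRobot("initial").state)
}
```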
@@ -529,6 +541,8 @@ func New(
func (r *localRobot) removeOrphanedResources(ctx context.Context,
rNames []resource.Name,
) {
r.reconfigurationLock.Lock()
defer r.reconfigurationLock.Unlock()
r.manager.markResourcesRemoved(rNames, nil)
if err := r.manager.removeMarkedAndClose(ctx, nil); err != nil {
r.logger.CErrorw(ctx, "error removing and closing marked resources",
@@ -1096,6 +1110,8 @@ func dialRobotClient(
// a best effort to remove no longer in use parts, but if it fails to do so, they could
// possibly leak resources. The given config may be modified by Reconfigure.
func (r *localRobot) Reconfigure(ctx context.Context, newConfig *config.Config) {
r.reconfigurationLock.Lock()
defer r.reconfigurationLock.Unlock()
r.reconfigure(ctx, newConfig, false)
}

@@ -1361,6 +1377,8 @@ func (r *localRobot) restartSingleModule(ctx context.Context, mod *config.Module
Modified: &config.ModifiedConfigDiff{},
Removed: &config.Config{},
}
r.reconfigurationLock.Lock()
defer r.reconfigurationLock.Unlock()
// note: if !isRunning (i.e. the module is in config but it crashed), putting it in diff.Modified
// results in a no-op; we use .Added instead.
if isRunning {
8 changes: 5 additions & 3 deletions robot/impl/local_robot_test.go
@@ -4435,8 +4435,7 @@ func TestRemovingOfflineRemote(t *testing.T) {
// prevents that behavior and removes the remote correctly.
func TestRemovingOfflineRemotes(t *testing.T) {
// Close the robot to stop the background workers from processing any messages to triggerConfig
r := setupLocalRobot(t, context.Background(), &config.Config{}, logging.NewTestLogger(t))
r.Close(context.Background())
r := setupLocalRobot(t, context.Background(), &config.Config{}, logging.NewTestLogger(t), withDisableCompleteConfigWorker())
localRobot := r.(*localRobot)

// Create a context that we can cancel to similuate the remote connection timeout
@@ -4469,6 +4468,9 @@ func TestRemovingOfflineRemotes(t *testing.T) {
wg.Add(1)
go func() {
defer wg.Done()
// manually grab the lock as completeConfig doesn't grab a lock
localRobot.reconfigurationLock.Lock()
defer localRobot.reconfigurationLock.Unlock()
localRobot.manager.completeConfig(ctxCompleteConfig, localRobot, false)
}()

@@ -4487,7 +4489,7 @@
// Ensure that the remote is not marked for removal while trying to connect to the remote
remote, ok := localRobot.manager.resources.Node(remoteName)
test.That(t, ok, test.ShouldBeTrue)
test.That(t, remote.MarkedForRemoval(), test.ShouldBeTrue)
test.That(t, remote.MarkedForRemoval(), test.ShouldBeFalse)
Member Author: Not sure why this was true to begin with; the comment says it should be false. In any case, false makes more sense: we shouldn't have gotten into Reconfigure yet at this point.

Member (@dgottlieb, Jan 3, 2025): Pretty scary this was passing pre-PR!

Member: Agree; how could it have been passing?

Member Author: I think that pre-PR, Reconfigure could progress past marking the remote for removal before it needed the lock, which is why the assertion could be true.


// Simulate a timeout by canceling the context while trying to connect to the remote
cancelCompleteConfig()
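To make the thread above concrete: with the lock moved up, the test goroutine holds reconfigurationLock for the whole completeConfig call, so a concurrent Reconfigure blocks before it can mark the remote for removal, which is why the assertion flips to ShouldBeFalse. Below is a minimal sketch of that ordering, with a plain mutex and a bool standing in for the real graph node.

```go
package main

import (
	"fmt"
	"sync"
	"time"
)

func main() {
	var reconfigurationLock sync.Mutex
	markedForRemoval := false

	// Stand-in for the test goroutine: hold the lock while "completeConfig" runs.
	reconfigurationLock.Lock()

	done := make(chan struct{})
	go func() {
		// Stand-in for Reconfigure: it must take the lock before it can mark anything.
		reconfigurationLock.Lock()
		markedForRemoval = true
		reconfigurationLock.Unlock()
		close(done)
	}()

	time.Sleep(100 * time.Millisecond) // "completeConfig" is still connecting to the remote
	fmt.Println("marked while the lock is held:", markedForRemoval) // false

	reconfigurationLock.Unlock()
	<-done
	fmt.Println("marked after the lock is released:", markedForRemoval) // true
}
```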
14 changes: 1 addition & 13 deletions robot/impl/resource_manager.go
@@ -8,7 +8,6 @@ import (
"os"
"reflect"
"strings"
"sync"
"time"

"github.com/jhump/protoreflect/desc"
@@ -55,9 +54,7 @@ type resourceManager struct {
opts resourceManagerOptions
logger logging.Logger

// resourceGraphLock manages access to the resource graph and nodes. If either may change, this lock should be taken.
resourceGraphLock sync.Mutex
Member Author: Moved the lock up to local_robot instead.

viz resource.Visualizer
}

type resourceManagerOptions struct {
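Putting the comment above together with the hunks below: the resourceManager methods no longer take any lock themselves; every mutation of the resource graph now happens under the caller's reconfigurationLock in local_robot. Below is a trimmed, runnable sketch of that ownership, with stand-in types and a string slice in place of the real graph.

```go
package main

import (
	"fmt"
	"sync"
)

// Stand-in for the real resourceManager; only the locking contract is shown.
type resourceManager struct {
	graph []string
}

// updateResources no longer locks anything itself: per this PR, the caller in
// local_robot must already hold reconfigurationLock.
func (m *resourceManager) updateResources(node string) {
	m.graph = append(m.graph, node)
}

type localRobot struct {
	reconfigurationLock sync.Mutex
	manager             *resourceManager
}

// Reconfigure owns the lock and holds it across every graph mutation it triggers.
func (r *localRobot) Reconfigure(node string) {
	r.reconfigurationLock.Lock()
	defer r.reconfigurationLock.Unlock()
	r.manager.updateResources(node)
}

func main() {
	r := &localRobot{manager: &resourceManager{}}
	r.Reconfigure("motor")
	fmt.Println(r.manager.graph) // [motor]
}
```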
@@ -320,9 +317,6 @@ func (manager *resourceManager) updateRemoteResourceNames(
}

func (manager *resourceManager) updateRemotesResourceNames(ctx context.Context) bool {
manager.resourceGraphLock.Lock()
defer manager.resourceGraphLock.Unlock()

anythingChanged := false
for _, name := range manager.resources.Names() {
gNode, _ := manager.resources.Node(name)
@@ -552,9 +546,7 @@ func (manager *resourceManager) removeMarkedAndClose(
ctx context.Context,
excludeFromClose map[resource.Name]struct{},
) error {
manager.resourceGraphLock.Lock()
defer func() {
defer manager.resourceGraphLock.Unlock()
if err := manager.viz.SaveSnapshot(manager.resources); err != nil {
manager.logger.Warnw("failed to save graph snapshot", "error", err)
}
@@ -608,12 +600,10 @@ func (manager *resourceManager) completeConfig(
lr *localRobot,
forceSync bool,
) {
manager.resourceGraphLock.Lock()
defer func() {
if err := manager.viz.SaveSnapshot(manager.resources); err != nil {
manager.logger.Warnw("failed to save graph snapshot", "error", err)
}
manager.resourceGraphLock.Unlock()
}()

// first handle remotes since they may reveal unresolved dependencies
@@ -1127,8 +1117,6 @@ func (manager *resourceManager) updateResources(
ctx context.Context,
conf *config.Diff,
) error {
manager.resourceGraphLock.Lock()
defer manager.resourceGraphLock.Unlock()
var allErrs error

// modules are not added into the resource tree as they belong to the module manager