-
Notifications
You must be signed in to change notification settings - Fork 111
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
RSDK-9588 - Take reconfigurationLock at a higher level #4645
Changes from all commits
482ff66
cc80a61
61bdba2
32a0ebe
5409d01
ec7cab1
6563cd1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4435,8 +4435,7 @@ func TestRemovingOfflineRemote(t *testing.T) { | |
// prevents that behavior and removes the remote correctly. | ||
func TestRemovingOfflineRemotes(t *testing.T) { | ||
cheukt marked this conversation as resolved.
Show resolved
Hide resolved
|
||
// Close the robot to stop the background workers from processing any messages to triggerConfig | ||
r := setupLocalRobot(t, context.Background(), &config.Config{}, logging.NewTestLogger(t)) | ||
r.Close(context.Background()) | ||
r := setupLocalRobot(t, context.Background(), &config.Config{}, logging.NewTestLogger(t), withDisableCompleteConfigWorker()) | ||
localRobot := r.(*localRobot) | ||
|
||
// Create a context that we can cancel to simulate the remote connection timeout | ||
|
@@ -4469,6 +4468,9 @@ func TestRemovingOfflineRemotes(t *testing.T) { | |
wg.Add(1) | ||
go func() { | ||
defer wg.Done() | ||
// manually grab the lock as completeConfig doesn't grab a lock | ||
localRobot.reconfigurationLock.Lock() | ||
defer localRobot.reconfigurationLock.Unlock() | ||
localRobot.manager.completeConfig(ctxCompleteConfig, localRobot, false) | ||
}() | ||
|
||
|
@@ -4487,7 +4489,7 @@ func TestRemovingOfflineRemotes(t *testing.T) { | |
// Ensure that the remote is not marked for removal while trying to connect to the remote | ||
remote, ok := localRobot.manager.resources.Node(remoteName) | ||
test.That(t, ok, test.ShouldBeTrue) | ||
test.That(t, remote.MarkedForRemoval(), test.ShouldBeTrue) | ||
test.That(t, remote.MarkedForRemoval(), test.ShouldBeFalse) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Not sure why this was true to begin with; the comment says this should be false. In any case, false makes more sense - we shouldn't have gotten into Reconfigure yet at this point. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Pretty scary this was passing pre-PR! There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Agree; how could it have been passing? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think pre-PR, the Reconfigure function could progress past the point where the remote is marked for removal before it needed the lock, which means the value could already be true when this assertion ran. |
||
|
||
// Simulate a timeout by canceling the context while trying to connect to the remote | ||
cancelCompleteConfig() | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,7 +8,6 @@ import ( | |
"os" | ||
"reflect" | ||
"strings" | ||
"sync" | ||
"time" | ||
|
||
"github.com/jhump/protoreflect/desc" | ||
|
@@ -55,9 +54,7 @@ type resourceManager struct { | |
opts resourceManagerOptions | ||
logger logging.Logger | ||
|
||
// resourceGraphLock manages access to the resource graph and nodes. If either may change, this lock should be taken. | ||
resourceGraphLock sync.Mutex | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Moved the lock up to local_robot instead. |
||
viz resource.Visualizer | ||
viz resource.Visualizer | ||
} | ||
|
||
type resourceManagerOptions struct { | ||
|
@@ -320,9 +317,6 @@ func (manager *resourceManager) updateRemoteResourceNames( | |
} | ||
|
||
func (manager *resourceManager) updateRemotesResourceNames(ctx context.Context) bool { | ||
manager.resourceGraphLock.Lock() | ||
defer manager.resourceGraphLock.Unlock() | ||
|
||
anythingChanged := false | ||
for _, name := range manager.resources.Names() { | ||
gNode, _ := manager.resources.Node(name) | ||
|
@@ -552,9 +546,7 @@ func (manager *resourceManager) removeMarkedAndClose( | |
ctx context.Context, | ||
excludeFromClose map[resource.Name]struct{}, | ||
) error { | ||
manager.resourceGraphLock.Lock() | ||
defer func() { | ||
defer manager.resourceGraphLock.Unlock() | ||
if err := manager.viz.SaveSnapshot(manager.resources); err != nil { | ||
manager.logger.Warnw("failed to save graph snapshot", "error", err) | ||
} | ||
|
@@ -608,12 +600,10 @@ func (manager *resourceManager) completeConfig( | |
lr *localRobot, | ||
forceSync bool, | ||
) { | ||
manager.resourceGraphLock.Lock() | ||
defer func() { | ||
if err := manager.viz.SaveSnapshot(manager.resources); err != nil { | ||
manager.logger.Warnw("failed to save graph snapshot", "error", err) | ||
} | ||
manager.resourceGraphLock.Unlock() | ||
}() | ||
|
||
// first handle remotes since they may reveal unresolved dependencies | ||
|
@@ -1127,8 +1117,6 @@ func (manager *resourceManager) updateResources( | |
ctx context.Context, | ||
conf *config.Diff, | ||
) error { | ||
manager.resourceGraphLock.Lock() | ||
defer manager.resourceGraphLock.Unlock() | ||
var allErrs error | ||
|
||
// modules are not added into the resource tree as they belong to the module manager | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
we grab the lock here for longer compared to inside resourceManager.Close()
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You're just mentioning that as part of the warning in the PR description about "locking for longer/more often than we were locking before"?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ya, this is the biggest change/risk based on the amount of extra locking, glad the tests pass haha