From 85432557145b2812349ba102f806739e1d813cca Mon Sep 17 00:00:00 2001 From: Tom Nabarro Date: Mon, 27 Jan 2025 23:05:59 +0000 Subject: [PATCH 1/3] DAOS-16312 control: Always use --force for dmg system stop Allow-unstable-test: true Features: control Signed-off-by: Tom Nabarro --- src/control/cmd/dmg/system.go | 5 +++-- src/control/cmd/dmg/system_test.go | 11 +++++++---- src/control/server/mgmt_system.go | 6 ++++-- 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/src/control/cmd/dmg/system.go b/src/control/cmd/dmg/system.go index 852244899c5..1892606e94b 100644 --- a/src/control/cmd/dmg/system.go +++ b/src/control/cmd/dmg/system.go @@ -177,7 +177,7 @@ func (cmd *systemEraseCmd) Execute(_ []string) error { // systemStopCmd is the struct representing the command to shutdown DAOS system. type systemStopCmd struct { baseRankListCmd - Force bool `long:"force" description:"Force stop DAOS system members"` + Force bool `long:"force" description:"Currently ignored"` } // Execute is run when systemStopCmd activates. @@ -191,7 +191,8 @@ func (cmd *systemStopCmd) Execute(_ []string) (errOut error) { if err := cmd.validateHostsRanks(); err != nil { return err } - req := &control.SystemStopReq{Force: cmd.Force} + // DAOS-16312: Always use force when stopping ranks. + req := &control.SystemStopReq{Force: true} req.Hosts.Replace(&cmd.Hosts.HostSet) req.Ranks.Replace(&cmd.Ranks.RankSet) diff --git a/src/control/cmd/dmg/system_test.go b/src/control/cmd/dmg/system_test.go index 17bbd614fd8..bf3a4333c0a 100644 --- a/src/control/cmd/dmg/system_test.go +++ b/src/control/cmd/dmg/system_test.go @@ -153,7 +153,7 @@ func TestDmg_SystemCommands(t *testing.T) { "system stop with no arguments", "system stop", strings.Join([]string{ - printRequest(t, &control.SystemStopReq{}), + printRequest(t, &control.SystemStopReq{Force: true}), }, " "), nil, }, @@ -169,7 +169,7 @@ func TestDmg_SystemCommands(t *testing.T) { "system stop with single rank", "system stop --ranks 0", strings.Join([]string{ - printRequest(t, withRanks(&control.SystemStopReq{}, 0)), + printRequest(t, withRanks(&control.SystemStopReq{Force: true}, 0)), }, " "), nil, }, @@ -177,7 +177,8 @@ func TestDmg_SystemCommands(t *testing.T) { "system stop with multiple ranks", "system stop --ranks 0,1,4", strings.Join([]string{ - printRequest(t, withRanks(&control.SystemStopReq{}, 0, 1, 4)), + printRequest(t, + withRanks(&control.SystemStopReq{Force: true}, 0, 1, 4)), }, " "), nil, }, @@ -185,7 +186,9 @@ func TestDmg_SystemCommands(t *testing.T) { "system stop with multiple hosts", "system stop --rank-hosts bar9,foo-[0-100]", strings.Join([]string{ - printRequest(t, withHosts(&control.SystemStopReq{}, "foo-[0-100]", "bar9")), + printRequest(t, + withHosts(&control.SystemStopReq{Force: true}, + "foo-[0-100]", "bar9")), }, " "), nil, }, diff --git a/src/control/server/mgmt_system.go b/src/control/server/mgmt_system.go index 77da97ff8a9..56e6613865b 100644 --- a/src/control/server/mgmt_system.go +++ b/src/control/server/mgmt_system.go @@ -902,8 +902,10 @@ func (svc *mgmtSvc) SystemStop(ctx context.Context, req *mgmtpb.SystemStopReq) ( return nil, err } - // First phase: Prepare the ranks for shutdown, but only if the request - // is for an unforced full system stop. + // First phase: Prepare the ranks for shutdown, but only if the request is for an unforced + // full system stop. + // DAOS-16312: Note that if force is always specified in request, PrepShutdown will never + // be called. if fReq.FullSystem && !fReq.Force { fReq.Method = control.PrepShutdownRanks fResp, _, err = svc.rpcFanout(ctx, fReq, fResp, true) From 061d971c00e97ea14eb9517d607e8553f12ca545 Mon Sep 17 00:00:00 2001 From: Tom Nabarro Date: Tue, 28 Jan 2025 13:25:06 +0000 Subject: [PATCH 2/3] use sigkill only but suppress exclude unless forced Features: control Signed-off-by: Tom Nabarro --- src/control/cmd/dmg/system.go | 5 ++--- src/control/cmd/dmg/system_test.go | 11 ++++------- src/control/server/ctl_ranks_rpc.go | 16 +++++++++------- src/control/server/ctl_ranks_rpc_test.go | 5 +++-- src/control/server/mgmt_system.go | 2 -- 5 files changed, 18 insertions(+), 21 deletions(-) diff --git a/src/control/cmd/dmg/system.go b/src/control/cmd/dmg/system.go index 1892606e94b..852244899c5 100644 --- a/src/control/cmd/dmg/system.go +++ b/src/control/cmd/dmg/system.go @@ -177,7 +177,7 @@ func (cmd *systemEraseCmd) Execute(_ []string) error { // systemStopCmd is the struct representing the command to shutdown DAOS system. type systemStopCmd struct { baseRankListCmd - Force bool `long:"force" description:"Currently ignored"` + Force bool `long:"force" description:"Force stop DAOS system members"` } // Execute is run when systemStopCmd activates. @@ -191,8 +191,7 @@ func (cmd *systemStopCmd) Execute(_ []string) (errOut error) { if err := cmd.validateHostsRanks(); err != nil { return err } - // DAOS-16312: Always use force when stopping ranks. - req := &control.SystemStopReq{Force: true} + req := &control.SystemStopReq{Force: cmd.Force} req.Hosts.Replace(&cmd.Hosts.HostSet) req.Ranks.Replace(&cmd.Ranks.RankSet) diff --git a/src/control/cmd/dmg/system_test.go b/src/control/cmd/dmg/system_test.go index bf3a4333c0a..17bbd614fd8 100644 --- a/src/control/cmd/dmg/system_test.go +++ b/src/control/cmd/dmg/system_test.go @@ -153,7 +153,7 @@ func TestDmg_SystemCommands(t *testing.T) { "system stop with no arguments", "system stop", strings.Join([]string{ - printRequest(t, &control.SystemStopReq{Force: true}), + printRequest(t, &control.SystemStopReq{}), }, " "), nil, }, @@ -169,7 +169,7 @@ func TestDmg_SystemCommands(t *testing.T) { "system stop with single rank", "system stop --ranks 0", strings.Join([]string{ - printRequest(t, withRanks(&control.SystemStopReq{Force: true}, 0)), + printRequest(t, withRanks(&control.SystemStopReq{}, 0)), }, " "), nil, }, @@ -177,8 +177,7 @@ func TestDmg_SystemCommands(t *testing.T) { "system stop with multiple ranks", "system stop --ranks 0,1,4", strings.Join([]string{ - printRequest(t, - withRanks(&control.SystemStopReq{Force: true}, 0, 1, 4)), + printRequest(t, withRanks(&control.SystemStopReq{}, 0, 1, 4)), }, " "), nil, }, @@ -186,9 +185,7 @@ func TestDmg_SystemCommands(t *testing.T) { "system stop with multiple hosts", "system stop --rank-hosts bar9,foo-[0-100]", strings.Join([]string{ - printRequest(t, - withHosts(&control.SystemStopReq{Force: true}, - "foo-[0-100]", "bar9")), + printRequest(t, withHosts(&control.SystemStopReq{}, "foo-[0-100]", "bar9")), }, " "), nil, }, diff --git a/src/control/server/ctl_ranks_rpc.go b/src/control/server/ctl_ranks_rpc.go index ba227742d76..922d878db77 100644 --- a/src/control/server/ctl_ranks_rpc.go +++ b/src/control/server/ctl_ranks_rpc.go @@ -1,5 +1,6 @@ // // (C) Copyright 2020-2024 Intel Corporation. +// (C) Copyright 2025 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -8,10 +9,10 @@ package server import ( "context" - "syscall" "time" "github.com/pkg/errors" + "golang.org/x/sys/unix" "google.golang.org/protobuf/proto" "github.com/daos-stack/daos/src/control/common/proto/convert" @@ -167,10 +168,11 @@ func (svc *ControlService) StopRanks(ctx context.Context, req *ctlpb.RanksReq) ( return nil, errors.New("no ranks specified in request") } - signal := syscall.SIGINT - if req.Force { - signal = syscall.SIGKILL - } + // DAOS-16312 NOTE: SIGINT or SIGTERM are more traditional signals to use to terminate *nix + // processes as they allow the processes to perform cleanup tasks before + // shutdown. SIGKILL is now being used to avoid potential data loss issues + // related to problems in clean shutdown of engines. The rationale maybe + // similar to how STONITH is used in high-availability systems. instances, err := svc.harness.FilterInstancesByRankSet(req.GetRanks()) if err != nil { @@ -185,8 +187,8 @@ func (svc *ControlService) StopRanks(ctx context.Context, req *ctlpb.RanksReq) ( if !ei.IsStarted() { continue } - if err := ei.Stop(signal); err != nil { - return nil, errors.Wrapf(err, "sending %s", signal) + if err := ei.Stop(unix.SIGKILL); err != nil { + return nil, errors.Wrapf(err, "sending kill signal") } } diff --git a/src/control/server/ctl_ranks_rpc_test.go b/src/control/server/ctl_ranks_rpc_test.go index 9355a04bce8..bc2f57806fe 100644 --- a/src/control/server/ctl_ranks_rpc_test.go +++ b/src/control/server/ctl_ranks_rpc_test.go @@ -1,5 +1,6 @@ // // (C) Copyright 2020-2024 Intel Corporation. +// (C) Copyright 2025 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -279,7 +280,7 @@ func TestServer_CtlSvc_StopRanks(t *testing.T) { }, "instances successfully stopped": { req: &ctlpb.RanksReq{Ranks: "0-3"}, - expSignalsSent: map[uint32]os.Signal{0: syscall.SIGINT, 1: syscall.SIGINT}, + expSignalsSent: map[uint32]os.Signal{0: syscall.SIGKILL, 1: syscall.SIGKILL}, expResults: []*sharedpb.RankResult{ {Rank: 1, State: msStopped}, {Rank: 2, State: msStopped}, @@ -296,7 +297,7 @@ func TestServer_CtlSvc_StopRanks(t *testing.T) { "instances not stopped in time": { req: &ctlpb.RanksReq{Ranks: "0-3"}, timeout: time.Second, - expSignalsSent: map[uint32]os.Signal{0: syscall.SIGINT, 1: syscall.SIGINT}, + expSignalsSent: map[uint32]os.Signal{0: syscall.SIGKILL, 1: syscall.SIGKILL}, instancesDontStop: true, expErr: errors.New("deadline exceeded"), }, diff --git a/src/control/server/mgmt_system.go b/src/control/server/mgmt_system.go index 56e6613865b..6ebb01ee430 100644 --- a/src/control/server/mgmt_system.go +++ b/src/control/server/mgmt_system.go @@ -904,8 +904,6 @@ func (svc *mgmtSvc) SystemStop(ctx context.Context, req *mgmtpb.SystemStopReq) ( // First phase: Prepare the ranks for shutdown, but only if the request is for an unforced // full system stop. - // DAOS-16312: Note that if force is always specified in request, PrepShutdown will never - // be called. if fReq.FullSystem && !fReq.Force { fReq.Method = control.PrepShutdownRanks fResp, _, err = svc.rpcFanout(ctx, fReq, fResp, true) From 9a49883cd5c4f80a74f6342573ba18b0c7e9533c Mon Sep 17 00:00:00 2001 From: Tom Nabarro Date: Wed, 29 Jan 2025 17:01:51 +0000 Subject: [PATCH 3/3] fix failure in log_entrye.py ftest Test-tag: vm,ControlLogEntry Allow-unstable-test: true Signed-off-by: Tom Nabarro --- src/tests/ftest/control/log_entry.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/tests/ftest/control/log_entry.py b/src/tests/ftest/control/log_entry.py index c231ef750ec..ce4a83a47c8 100644 --- a/src/tests/ftest/control/log_entry.py +++ b/src/tests/ftest/control/log_entry.py @@ -1,5 +1,6 @@ """ (C) Copyright 2023 Intel Corporation. + (C) Copyright 2025 Hewlett Packard Enterprise Development LP SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -171,7 +172,7 @@ def test_control_log_entry(self): self.log_step('Stop/start 2 random ranks') stop_ranks = self.random.sample(list(self.server_managers[0].ranks), k=2) - expected = [fr'rank {rank}.*exited with 0' for rank in stop_ranks] \ + expected = [fr'rank {rank}.*killed' for rank in stop_ranks] \ + [fr'process.*started on rank {rank}' for rank in stop_ranks] with self.verify_journalctl(expected): self.server_managers[0].stop_ranks(stop_ranks, self.d_log)