Skip to content

Commit

Permalink
add roles detail from config when engines are off-line
Browse files Browse the repository at this point in the history
Features: control
Required-githooks: true

Signed-off-by: Tom Nabarro <[email protected]>
  • Loading branch information
tanabarr committed Dec 12, 2023
1 parent 3e04a2b commit b804a85
Show file tree
Hide file tree
Showing 7 changed files with 293 additions and 81 deletions.
29 changes: 29 additions & 0 deletions src/control/cmd/dmg/pretty/storage_nvme_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,11 @@ func TestPretty_PrintNVMeController(t *testing.T) {
c.SmdDevices = []*storage.SmdDevice{sd}
return c
}
ctrlrWithNilRank := func(idx int32) *storage.NvmeController {
c := ctrlrWithSmd(idx, 0)
c.SmdDevices[0].Rank = ranklist.NilRank
return c
}
for name, tc := range map[string]struct {
devices storage.NvmeControllers
expPrintStr string
Expand Down Expand Up @@ -68,6 +73,30 @@ NVMe PCI Model FW Revision Socket Capacity Role(s) Rank
-------- ----- ----------- ------ -------- ------- ----
0000:01:00.0 model-1 fwRev-1 1 2.0 TB data 1
0000:02:00.0 model-2 fwRev-2 0 2.0 TB meta,wal 2
`,
},
"controllers with no roles": {
devices: storage.NvmeControllers{
ctrlrWithSmd(1, 0),
ctrlrWithSmd(2, 0),
},
expPrintStr: `
NVMe PCI Model FW Revision Socket Capacity Role(s) Rank
-------- ----- ----------- ------ -------- ------- ----
0000:01:00.0 model-1 fwRev-1 1 2.0 TB N/A 1
0000:02:00.0 model-2 fwRev-2 0 2.0 TB N/A 2
`,
},
"controllers with no rank": {
devices: storage.NvmeControllers{
ctrlrWithNilRank(1),
ctrlrWithNilRank(2),
},
expPrintStr: `
NVMe PCI Model FW Revision Socket Capacity Role(s) Rank
-------- ----- ----------- ------ -------- ------- ----
0000:01:00.0 model-1 fwRev-1 1 2.0 TB N/A None
0000:02:00.0 model-2 fwRev-2 0 2.0 TB N/A None
`,
},
} {
Expand Down
4 changes: 2 additions & 2 deletions src/control/cmd/dmg/storage.go
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
//
// (C) Copyright 2019-2022 Intel Corporation.
// (C) Copyright 2019-2023 Intel Corporation.
//
// SPDX-License-Identifier: BSD-2-Clause-Patent
//
Expand Down Expand Up @@ -49,7 +49,7 @@ func (cmd *storageScanCmd) Execute(_ []string) error {

req := &control.StorageScanReq{
NvmeHealth: cmd.NvmeHealth,
// Don't strip nvme details if verbose or health flags set.
// Strip nvme details if verbose and health flags are unset.
NvmeBasic: !(cmd.Verbose || cmd.NvmeHealth),
}
req.SetHostList(cmd.getHostList())
Expand Down
132 changes: 90 additions & 42 deletions src/control/server/ctl_storage_rpc.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,10 @@ import (
"github.com/daos-stack/daos/src/control/common"
"github.com/daos-stack/daos/src/control/common/proto"
"github.com/daos-stack/daos/src/control/common/proto/convert"
"github.com/daos-stack/daos/src/control/common/proto/ctl"
ctlpb "github.com/daos-stack/daos/src/control/common/proto/ctl"
"github.com/daos-stack/daos/src/control/lib/daos"
"github.com/daos-stack/daos/src/control/lib/hardware"
"github.com/daos-stack/daos/src/control/lib/ranklist"
"github.com/daos-stack/daos/src/control/logging"
"github.com/daos-stack/daos/src/control/server/engine"
"github.com/daos-stack/daos/src/control/server/storage"
Expand Down Expand Up @@ -69,8 +69,39 @@ var (

type scanBdevsFn func(storage.BdevScanRequest) (*storage.BdevScanResponse, error)

// ctrlrToPciStr returns the string form of the PCI address held by the given
// protobuf NVMe controller. When the address is a VMD backing device address,
// it is converted to the corresponding VMD address before being stringified.
// An error is returned if the address cannot be parsed or converted.
func ctrlrToPciStr(nc *ctlpb.NvmeController) (string, error) {
	addr, err := hardware.NewPCIAddress(nc.GetPciAddr())
	if err != nil {
		return "", errors.Errorf("Invalid PCI address: %s", err)
	}

	// Plain (non-VMD-backing) addresses need no translation.
	if !addr.IsVMDBackingAddress() {
		return addr.String(), nil
	}

	vmdAddr, err := addr.BackingToVMDAddress()
	if err != nil {
		return "", errors.Errorf("Invalid VMD address: %s", err)
	}

	return vmdAddr.String(), nil
}

// findBdevTier searches the supplied tier configs for a bdev tier whose device
// list contains pciAddr. The first matching tier config is returned, or nil if
// no bdev tier lists the address.
func findBdevTier(pciAddr string, tcs storage.TierConfigs) *storage.TierConfig {
	for _, tier := range tcs {
		// Only bdev tiers carry a device list to compare against.
		if tier.IsBdev() {
			for _, dev := range tier.Bdev.DeviceList.Devices() {
				if dev == pciAddr {
					return tier
				}
			}
		}
	}

	return nil
}

// Convert bdev scan results to protobuf response.
func bdevScanToProtoResp(scan scanBdevsFn, req storage.BdevScanRequest) (*ctlpb.ScanNvmeResp, error) {
func bdevScanToProtoResp(scan scanBdevsFn, bdevCfgs storage.TierConfigs) (*ctlpb.ScanNvmeResp, error) {
req := storage.BdevScanRequest{DeviceList: bdevCfgs.Bdevs()}

resp, err := scan(req)
if err != nil {
return nil, err
Expand All @@ -82,18 +113,35 @@ func bdevScanToProtoResp(scan scanBdevsFn, req storage.BdevScanRequest) (*ctlpb.
return nil, err
}

if bdevCfgs.HaveBdevs() {
// Update proto Ctrlrs with role info for offline display.
for _, c := range pbCtrlrs {
pciAddrStr, err := ctrlrToPciStr(c)
if err != nil {
return nil, err
}
bc := findBdevTier(pciAddrStr, bdevCfgs)
if bc == nil {
return nil, errors.Errorf("unknown PCI device, scanned ctrlr %q "+
"not found in cfg", pciAddrStr)
}
if len(c.SmdDevices) != 0 {
return nil, errors.Errorf("scanned ctrlr %q has unexpected smd",
pciAddrStr)
}
c.SmdDevices = append(c.SmdDevices, &ctlpb.SmdDevice{
RoleBits: uint32(bc.Bdev.DeviceRoles.OptionBits),
Rank: uint32(ranklist.NilRank),
})
}
}

return &ctlpb.ScanNvmeResp{
State: new(ctlpb.ResponseState),
Ctrlrs: pbCtrlrs,
}, nil
}

// bdevScanGlobal scans bdevs via the harness's ControlService storage provider
// (rather than per-engine), restricting the scan request to the given
// configured device list, and returns the result as a protobuf response.
func bdevScanGlobal(cs *ControlService, cfgBdevs *storage.BdevDeviceList) (*ctlpb.ScanNvmeResp, error) {
	return bdevScanToProtoResp(cs.storage.ScanBdevs, storage.BdevScanRequest{
		DeviceList: cfgBdevs,
	})
}

// Scan bdevs through each engine and collate response results.
func bdevScanEngines(ctx context.Context, cs *ControlService, req *ctlpb.ScanNvmeReq, nsps []*ctlpb.ScmNamespace) (*ctlpb.ScanNvmeResp, error) {
var errLast error
Expand All @@ -111,6 +159,8 @@ func bdevScanEngines(ctx context.Context, cs *ControlService, req *ctlpb.ScanNvm
eReq.MetaSize, eReq.RdbSize = ms, rs
}

// If only some of the engines return results, indicate errors for the non-ready
// engines whilst still returning the successful scan results.
respEng, err := scanEngineBdevs(ctx, ei, eReq)
if err != nil {
err = errors.Wrapf(err, "instance %d", ei.Index())
Expand Down Expand Up @@ -140,6 +190,7 @@ func bdevScanTrimResults(req *ctlpb.ScanNvmeReq, resp *ctlpb.ScanNvmeResp) *ctlp
pbc.HealthStats = nil
}
if req.GetBasic() {
pbc.SmdDevices = nil
pbc.Serial = ""
pbc.Model = ""
pbc.FwRev = ""
Expand All @@ -159,11 +210,15 @@ func engineHasStarted(instances []Engine) bool {
return false
}

func bdevScanAssigned(ctx context.Context, cs *ControlService, req *ctlpb.ScanNvmeReq, nsps []*ctlpb.ScmNamespace, hasStarted *bool, cfgBdevs *storage.BdevDeviceList) (*ctlpb.ScanNvmeResp, error) {
func bdevScanAssigned(ctx context.Context, cs *ControlService, req *ctlpb.ScanNvmeReq, nsps []*ctlpb.ScmNamespace, hasStarted *bool, bdevCfgs storage.TierConfigs) (*ctlpb.ScanNvmeResp, error) {
*hasStarted = engineHasStarted(cs.harness.Instances())
if !*hasStarted {
cs.log.Debugf("scan bdevs from control service as no engines started")
return bdevScanGlobal(cs, cfgBdevs)
if req.Meta {
return nil, errors.New("meta smd usage info unavailable as engines stopped")
}

return bdevScanToProtoResp(cs.storage.ScanBdevs, bdevCfgs)
}

// Delegate scan to engine instances as soon as one engine with assigned bdevs has started.
Expand All @@ -184,16 +239,21 @@ func bdevScan(ctx context.Context, cs *ControlService, req *ctlpb.ScanNvmeReq, n
}
}()

cfgBdevs := getBdevCfgsFromSrvCfg(cs.srvCfg).Bdevs()
bdevCfgs := getBdevCfgsFromSrvCfg(cs.srvCfg)
nrCfgBdevs := bdevCfgs.Bdevs().Len()

if cfgBdevs.Len() == 0 {
if nrCfgBdevs == 0 {
cs.log.Debugf("scan bdevs from control service as no bdevs in cfg")
if req.Meta {
return nil, errors.New("meta smd usage info unavailable as no bdevs in cfg")
}

// No bdevs configured for engines to claim so scan through control service.
resp, err = bdevScanGlobal(cs, cfgBdevs)
resp, err = bdevScanToProtoResp(cs.storage.ScanBdevs, bdevCfgs)
if err != nil {
return nil, err
}

return bdevScanTrimResults(req, resp), nil
}

Expand All @@ -202,26 +262,26 @@ func bdevScan(ctx context.Context, cs *ControlService, req *ctlpb.ScanNvmeReq, n
// been claimed by SPDK but details are not yet available over dRPC.

var hasStarted bool
resp, err = bdevScanAssigned(ctx, cs, req, nsps, &hasStarted, cfgBdevs)
resp, err = bdevScanAssigned(ctx, cs, req, nsps, &hasStarted, bdevCfgs)
if err != nil {
return nil, err
}

// Retry once if global scan returns unexpected number of controllers in case engines
// claimed devices between when started state was checked and scan was executed.
if !hasStarted && len(resp.Ctrlrs) != cfgBdevs.Len() {
if !hasStarted && len(resp.Ctrlrs) != nrCfgBdevs {
cs.log.Debugf("retrying bdev scan as unexpected nr returned, want %d got %d",
cfgBdevs.Len(), len(resp.Ctrlrs))
nrCfgBdevs, len(resp.Ctrlrs))

resp, err = bdevScanAssigned(ctx, cs, req, nsps, &hasStarted, cfgBdevs)
resp, err = bdevScanAssigned(ctx, cs, req, nsps, &hasStarted, bdevCfgs)
if err != nil {
return nil, err
}
}

if len(resp.Ctrlrs) != cfgBdevs.Len() {
cs.log.Noticef("bdev scan returned unexpected nr, want %d got %d",
cfgBdevs.Len(), len(resp.Ctrlrs))
if len(resp.Ctrlrs) != nrCfgBdevs {
cs.log.Noticef("bdev scan returned unexpected nr, want %d got %d", nrCfgBdevs,
len(resp.Ctrlrs))
}

return bdevScanTrimResults(req, resp), nil
Expand Down Expand Up @@ -281,36 +341,24 @@ func (cs *ControlService) scanScm(ctx context.Context, req *ctlpb.ScanScmReq) (*
}

// Returns the engine configuration managing the given NVMe controller
func (cs *ControlService) getEngineCfgFromNvmeCtl(nc *ctl.NvmeController) (*engine.Config, error) {
pciAddr, err := hardware.NewPCIAddress(nc.GetPciAddr())
func (cs *ControlService) getEngineCfgFromNvmeCtl(nc *ctlpb.NvmeController) (*engine.Config, error) {
pciAddrStr, err := ctrlrToPciStr(nc)
if err != nil {
return nil, errors.Errorf("Invalid PCI address: %s", err)
}
if pciAddr.IsVMDBackingAddress() {
if pciAddr, err = pciAddr.BackingToVMDAddress(); err != nil {
return nil, errors.Errorf("Invalid VMD address: %s", err)
}
return nil, err
}
ctlrAddr := pciAddr.String()

for index := range cs.srvCfg.Engines {
for _, tierCfg := range cs.srvCfg.Engines[index].Storage.Tiers {
if !tierCfg.IsBdev() {
continue
}
for _, devName := range tierCfg.Bdev.DeviceList.Devices() {
if devName == ctlrAddr {
return cs.srvCfg.Engines[index], nil
}
}
if findBdevTier(pciAddrStr, cs.srvCfg.Engines[index].Storage.Tiers) != nil {
return cs.srvCfg.Engines[index], nil
}
}

return nil, errors.Errorf("unknown PCI device %q", pciAddr)
return nil, errors.Errorf("unknown PCI device, scanned ctrlr %q not found in cfg",
pciAddrStr)
}

// Returns the engine configuration managing the given SCM name-space
func (cs *ControlService) getEngineCfgFromScmNsp(nsp *ctl.ScmNamespace) (*engine.Config, error) {
func (cs *ControlService) getEngineCfgFromScmNsp(nsp *ctlpb.ScmNamespace) (*engine.Config, error) {
mountPoint := nsp.GetMount().Path
for index := range cs.srvCfg.Engines {
for _, tierCfg := range cs.srvCfg.Engines[index].Storage.Tiers {
Expand Down Expand Up @@ -383,7 +431,7 @@ func metaRdbComputeSz(cs *ControlService, ei Engine, nsps []*ctlpb.ScmNamespace)
}

type deviceToAdjust struct {
ctlr *ctl.NvmeController
ctlr *ctlpb.NvmeController
idx int
rank uint32
}
Expand Down Expand Up @@ -585,7 +633,7 @@ func (cs *ControlService) adjustScmSize(resp *ctlpb.ScanScmResp) {
}
mnt.UsableBytes -= mdBytes

removeControlPlaneMetadata := func(m *ctl.ScmNamespace_Mount) {
removeControlPlaneMetadata := func(m *ctlpb.ScmNamespace_Mount) {
mountPath := m.GetPath()

cs.log.Tracef("Removing control plane metadata (%s, %d bytes) from the usable size of the SCM device %q",
Expand Down
Loading

0 comments on commit b804a85

Please sign in to comment.