diff --git a/src/control/cmd/dmg/pretty/storage.go b/src/control/cmd/dmg/pretty/storage.go index 8146d89496e..d888e502906 100644 --- a/src/control/cmd/dmg/pretty/storage.go +++ b/src/control/cmd/dmg/pretty/storage.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2020-2023 Intel Corporation. +// (C) Copyright 2020-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -26,7 +26,8 @@ func printHostStorageMapVerbose(hsm control.HostStorageMap, out io.Writer, opts hosts := getPrintHosts(hss.HostSet.RangedString(), opts...) lineBreak := strings.Repeat("-", len(hosts)) fmt.Fprintf(out, "%s\n%s\n%s\n", lineBreak, hosts, lineBreak) - fmt.Fprintf(out, "HugePage Size: %d KB\n", hss.HostStorage.MemInfo.HugepageSizeKiB) + fmt.Fprintf(out, "HugePage Size: %d KB\n\n", + hss.HostStorage.MemInfo.HugepageSizeKiB) if len(hss.HostStorage.ScmNamespaces) == 0 { if err := PrintScmModules(hss.HostStorage.ScmModules, out, opts...); err != nil { return err diff --git a/src/control/cmd/dmg/pretty/storage_nvme.go b/src/control/cmd/dmg/pretty/storage_nvme.go index ec759994ee5..4086499eabe 100644 --- a/src/control/cmd/dmg/pretty/storage_nvme.go +++ b/src/control/cmd/dmg/pretty/storage_nvme.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2020-2023 Intel Corporation. +// (C) Copyright 2020-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -168,13 +168,14 @@ func parseNvmeFormatResults(inResults storage.NvmeControllers) storage.NvmeContr parsedResults = append(parsedResults, result) } } - return parsedResults } -func printNvmeFormatResults(ctrlrs storage.NvmeControllers, out io.Writer, opts ...PrintConfigOption) error { +func printNvmeFormatResults(inCtrlrs storage.NvmeControllers, out io.Writer, opts ...PrintConfigOption) error { + ctrlrs := parseNvmeFormatResults(inCtrlrs) + iw := txtfmt.NewIndentWriter(out) if len(ctrlrs) == 0 { - fmt.Fprintln(out, "\tNo NVMe devices found") + fmt.Fprintln(iw, "No NVMe devices were formatted") return nil } @@ -188,13 +189,13 @@ func printNvmeFormatResults(ctrlrs storage.NvmeControllers, out io.Writer, opts sort.Slice(ctrlrs, func(i, j int) bool { return ctrlrs[i].PciAddr < ctrlrs[j].PciAddr }) - for _, ctrlr := range parseNvmeFormatResults(ctrlrs) { - row := txtfmt.TableRow{pciTitle: ctrlr.PciAddr} - row[resultTitle] = ctrlr.Info + for _, c := range ctrlrs { + row := txtfmt.TableRow{pciTitle: c.PciAddr} + row[resultTitle] = c.Info roles := "NA" // Assumes that all SMD devices on a controller have the same roles. - if len(ctrlr.SmdDevices) > 0 { - roles = fmt.Sprintf("%s", ctrlr.SmdDevices[0].Roles.String()) + if len(c.SmdDevices) > 0 { + roles = fmt.Sprintf("%s", c.SmdDevices[0].Roles.String()) } row[rolesTitle] = roles @@ -209,8 +210,9 @@ func printNvmeFormatResults(ctrlrs storage.NvmeControllers, out io.Writer, opts func PrintNvmeControllers(controllers storage.NvmeControllers, out io.Writer, opts ...PrintConfigOption) error { w := txtfmt.NewErrWriter(out) + iw := txtfmt.NewIndentWriter(out) if len(controllers) == 0 { - fmt.Fprintln(out, "\tNo NVMe devices found") + fmt.Fprintln(iw, "No NVMe devices found") return w.Err } diff --git a/src/control/cmd/dmg/pretty/storage_scm.go b/src/control/cmd/dmg/pretty/storage_scm.go index cffdaae5b90..bfc2559421f 100644 --- a/src/control/cmd/dmg/pretty/storage_scm.go +++ b/src/control/cmd/dmg/pretty/storage_scm.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2020-2021 Intel Corporation. +// (C) Copyright 2020-2024 Intel Corporation. 
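// Editor's note: the pretty-printer hunks above replace hard-coded "\t" prefixes
// with txtfmt.NewIndentWriter so the "No ... found" / "No NVMe devices were
// formatted" messages pick up a consistent indent. Below is a minimal, hedged
// sketch of that wrapper pattern — illustrative only, not the real txtfmt
// implementation (which lives in src/control/lib/txtfmt and has its own options).
package main

import (
	"bytes"
	"fmt"
	"io"
	"os"
	"strings"
)

// indentWriter prefixes every line written through it with a fixed indent.
// (Simplified: it assumes whole lines arrive per Write call.)
type indentWriter struct {
	out    io.Writer
	indent string
}

func (w *indentWriter) Write(p []byte) (int, error) {
	var buf bytes.Buffer
	for _, line := range strings.SplitAfter(string(p), "\n") {
		if line == "" {
			continue
		}
		buf.WriteString(w.indent)
		buf.WriteString(line)
	}
	if _, err := w.out.Write(buf.Bytes()); err != nil {
		return 0, err
	}
	// Report the input as consumed, per io.Writer semantics.
	return len(p), nil
}

func main() {
	iw := &indentWriter{out: os.Stdout, indent: "  "}
	fmt.Fprintln(iw, "No NVMe devices found") // prints "  No NVMe devices found"
}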
// // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -18,8 +18,9 @@ import ( ) func printScmMountPoints(mountpoints storage.ScmMountPoints, out io.Writer, opts ...PrintConfigOption) error { + iw := txtfmt.NewIndentWriter(out) if len(mountpoints) == 0 { - fmt.Fprintln(out, "\tNo SCM mount results") + fmt.Fprintln(iw, "No SCM mount results") return nil } @@ -48,9 +49,9 @@ func printScmMountPoints(mountpoints storage.ScmMountPoints, out io.Writer, opts // TODO: un-export function when not needed in cmd/daos_server/storage.go func PrintScmModules(modules storage.ScmModules, out io.Writer, opts ...PrintConfigOption) error { w := txtfmt.NewErrWriter(out) - + iw := txtfmt.NewIndentWriter(out) if len(modules) == 0 { - fmt.Fprintln(out, "\tNo SCM modules found") + fmt.Fprintln(iw, "No SCM modules found") return w.Err } @@ -89,9 +90,9 @@ func PrintScmModules(modules storage.ScmModules, out io.Writer, opts ...PrintCon // TODO: un-export function when not needed in cmd/daos_server/storage.go func PrintScmNamespaces(namespaces storage.ScmNamespaces, out io.Writer, opts ...PrintConfigOption) error { w := txtfmt.NewErrWriter(out) - + iw := txtfmt.NewIndentWriter(out) if len(namespaces) == 0 { - fmt.Fprintln(out, "\tNo SCM namespaces found") + fmt.Fprintln(iw, "No SCM namespaces found") return w.Err } diff --git a/src/control/cmd/dmg/pretty/storage_test.go b/src/control/cmd/dmg/pretty/storage_test.go index 4e5279c31bf..04350e9d6cd 100644 --- a/src/control/cmd/dmg/pretty/storage_test.go +++ b/src/control/cmd/dmg/pretty/storage_test.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2020-2023 Intel Corporation. +// (C) Copyright 2020-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -472,7 +472,8 @@ Errors: host1 ----- HugePage Size: 2048 KB - No SCM modules found + + No SCM modules found NVMe PCI Model FW Revision Socket Capacity Role(s) Rank -------- ----- ----------- ------ -------- ------- ---- @@ -501,11 +502,12 @@ Errors: host1 ----- HugePage Size: 2048 KB + SCM Module Socket Memory Ctrlr Channel Channel Slot Capacity ---------- ------ ------------ ------- ------------ -------- 1 1 1 1 1 954 MiB - No NVMe devices found + No NVMe devices found `, }, @@ -535,9 +537,10 @@ Errors: host[1-2] --------- HugePage Size: 2048 KB - No SCM modules found - No NVMe devices found + No SCM modules found + + No NVMe devices found `, }, @@ -561,9 +564,10 @@ HugePage Size: 2048 KB host[1-2] --------- HugePage Size: 2048 KB - No SCM modules found - No NVMe devices found + No SCM modules found + + No NVMe devices found `, }, @@ -583,6 +587,7 @@ HugePage Size: 2048 KB host1 ----- HugePage Size: 2048 KB + SCM Module Socket Memory Ctrlr Channel Channel Slot Capacity ---------- ------ ------------ ------- ------------ -------- 1 1 1 1 1 954 MiB @@ -609,6 +614,7 @@ NVMe PCI Model FW Revision Socket Capacity Role(s) Rank host1 ----- HugePage Size: 2048 KB + SCM Namespace Socket Capacity ------------- ------ -------- pmem0 0 1.0 TB @@ -639,6 +645,7 @@ NVMe PCI Model FW Revision Socket Capacity Role(s) Rank host[1-2] --------- HugePage Size: 2048 KB + SCM Module Socket Memory Ctrlr Channel Channel Slot Capacity ---------- ------ ------------ ------- ------------ -------- 1 1 1 1 1 954 MiB @@ -669,17 +676,19 @@ NVMe PCI Model FW Revision Socket Capacity Role(s) Rank host1 ----- HugePage Size: 2048 KB + SCM Module Socket Memory Ctrlr Channel Channel Slot Capacity ---------- ------ ------------ ------- ------------ -------- 1 1 1 1 1 954 MiB - No NVMe devices found + No NVMe devices found ----- host2 ----- 
HugePage Size: 2048 KB - No SCM modules found + + No SCM modules found NVMe PCI Model FW Revision Socket Capacity Role(s) Rank -------- ----- ----------- ------ -------- ------- ---- @@ -699,6 +708,7 @@ NVMe PCI Model FW Revision Socket Capacity Role(s) Rank host[0-1023] ------------ HugePage Size: 2048 KB + SCM Module Socket Memory Ctrlr Channel Channel Slot Capacity ---------- ------ ------------ ------- ------------ -------- 1 1 1 1 1 954 MiB @@ -737,7 +747,8 @@ NVMe PCI Model FW Revision Socket Capacity Role(s) Rank host-[0001-0004] ---------------- HugePage Size: 2048 KB - No SCM modules found + + No SCM modules found NVMe PCI Model FW Revision Socket Capacity Role(s) Rank -------- ----- ----------- ------ -------- ------- ---- @@ -773,7 +784,8 @@ NVMe PCI Model FW Revision Socket Capacity Role(s) Rank host-j-[0001-0004] ------------------ HugePage Size: 2048 KB - No SCM modules found + + No SCM modules found NVMe PCI Model FW Revision Socket Capacity Role(s) Rank -------- ----- ----------- ------ -------- ------- ---- @@ -809,6 +821,7 @@ NVMe PCI Model FW Revision Socket Capacity Role(s) Rank host[1,3] --------- HugePage Size: 2048 KB + SCM Namespace Socket Capacity ------------- ------ -------- pmem0 0 1.0 TB @@ -823,6 +836,7 @@ NVMe PCI Model FW Revision Socket Capacity Role(s) Rank host[2,4] --------- HugePage Size: 2048 KB + SCM Namespace Socket Capacity ------------- ------ -------- pmem0 0 1.0 TB @@ -1058,6 +1072,41 @@ Format Summary: Hosts SCM Devices NVMe Devices ----- ----------- ------------ host1 2 2 +`, + }, + "1 SCM, NVMe skipped": { + resp: &control.StorageFormatResp{ + HostErrorsResp: control.HostErrorsResp{ + HostErrors: make(control.HostErrorsMap), + }, + HostStorage: func() control.HostStorageMap { + hsm := make(control.HostStorageMap) + hs := &control.HostStorage{ + ScmMountPoints: []*storage.ScmMountPoint{ + { + Info: "success", + Path: "/mnt/0", + }, + }, + NvmeDevices: []*storage.NvmeController{ + { + Info: "skipping", + PciAddr: storage.NilBdevAddress, + }, + }, + } + if err := hsm.Add("host1", hs); err != nil { + t.Fatal(err) + } + return hsm + }(), + }, + expPrintStr: ` + +Format Summary: + Hosts SCM Devices NVMe Devices + ----- ----------- ------------ + host1 1 0 `, }, "2 Hosts, 2 SCM, 2 NVMe; first SCM fails": { @@ -1278,6 +1327,46 @@ NVMe PCI Format Result Role(s) 1 CTL_SUCCESS data,meta,wal 2 CTL_SUCCESS data,meta,wal +`, + }, + "1 SCM, NVMe skipped": { + resp: &control.StorageFormatResp{ + HostErrorsResp: control.HostErrorsResp{ + HostErrors: make(control.HostErrorsMap), + }, + HostStorage: func() control.HostStorageMap { + hsm := make(control.HostStorageMap) + hs := &control.HostStorage{ + ScmMountPoints: []*storage.ScmMountPoint{ + { + Info: "CTL_SUCCESS", + Path: "/mnt/0", + }, + }, + NvmeDevices: []*storage.NvmeController{ + { + Info: "skipping", + PciAddr: storage.NilBdevAddress, + }, + }, + } + if err := hsm.Add("host1", hs); err != nil { + t.Fatal(err) + } + return hsm + }(), + }, + expPrintStr: ` + +----- +host1 +----- +SCM Mount Format Result +--------- ------------- +/mnt/0 CTL_SUCCESS + + No NVMe devices were formatted + `, }, } { diff --git a/src/control/fault/code/codes.go b/src/control/fault/code/codes.go index 4a368aeb5e3..f444453c6b7 100644 --- a/src/control/fault/code/codes.go +++ b/src/control/fault/code/codes.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2018-2023 Intel Corporation. +// (C) Copyright 2018-2024 Intel Corporation. 
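// Editor's note: the new "1 SCM, NVMe skipped" test cases above feed a
// storage.NilBdevAddress placeholder result through the printers and expect the
// NVMe device count in the format summary to drop to zero. A hedged sketch of
// that filtering rule follows; countFormattedNvme and nilBdevAddress are
// hypothetical stand-ins, the real logic lives in parseNvmeFormatResults and
// the summary table code.
package main

import "fmt"

const nilBdevAddress = "<nil>" // stand-in for storage.NilBdevAddress

type nvmeResult struct {
	PciAddr string
	Info    string
}

// countFormattedNvme ignores placeholder results that only report a skip.
func countFormattedNvme(results []nvmeResult) int {
	n := 0
	for _, r := range results {
		if r.PciAddr == nilBdevAddress {
			continue // skipped controller, nothing was formatted
		}
		n++
	}
	return n
}

func main() {
	results := []nvmeResult{
		{PciAddr: nilBdevAddress, Info: "skipping"},
		{PciAddr: "0000:80:00.0", Info: "CTL_SUCCESS"},
	}
	fmt.Println("NVMe devices formatted:", countFormattedNvme(results)) // 1
}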
// // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -152,6 +152,7 @@ const ( ServerIncompatibleComponents ServerNoCompatibilityInsecure ServerPoolHasContainers + ServerHugepagesDisabled ) // server config fault codes @@ -182,7 +183,7 @@ const ( ServerConfigFaultCallbackEmpty ServerConfigFaultDomainTooManyLayers ServerConfigNrHugepagesOutOfRange - ServerConfigHugepagesDisabled + ServerConfigHugepagesDisabledWithBdevs ServerConfigVMDSettingDuplicate ServerConfigEngineNUMAImbalance ServerConfigControlMetadataNoPath diff --git a/src/control/server/config/faults.go b/src/control/server/config/faults.go index 67148dbd010..31c034d3e5b 100644 --- a/src/control/server/config/faults.go +++ b/src/control/server/config/faults.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2020-2023 Intel Corporation. +// (C) Copyright 2020-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -91,10 +91,10 @@ var ( "the fault domain path may have a maximum of 2 levels below the root", "update either the fault domain ('fault_path' parameter) or callback script ('fault_cb' parameter) and restart the control server", ) - FaultConfigHugepagesDisabled = serverConfigFault( - code.ServerConfigHugepagesDisabled, + FaultConfigHugepagesDisabledWithBdevs = serverConfigFault( + code.ServerConfigHugepagesDisabledWithBdevs, "hugepages cannot be disabled if bdevs have been specified in config", - "remove nr_hugepages parameter from config to have the value automatically calculated", + "either set false (or remove) disable_hugepages parameter or remove nvme storage assignment in config and restart the control server", ) FaultConfigVMDSettingDuplicate = serverConfigFault( code.ServerConfigVMDSettingDuplicate, diff --git a/src/control/server/config/server.go b/src/control/server/config/server.go index 4b6eeb911ab..6242478413e 100644 --- a/src/control/server/config/server.go +++ b/src/control/server/config/server.go @@ -488,7 +488,7 @@ func (cfg *Server) SetNrHugepages(log logging.Logger, mi *common.MemInfo) error } if cfg.DisableHugepages { - return FaultConfigHugepagesDisabled + return FaultConfigHugepagesDisabledWithBdevs } // Calculate minimum number of hugepages for all configured engines. 
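// Editor's note: the codes.go/faults.go/server.go hunks above split one fault
// into two — a config-time FaultConfigHugepagesDisabledWithBdevs and a runtime
// ServerHugepagesDisabled — and SetNrHugepages now rejects disable_hugepages
// whenever NVMe bdevs are configured. A condensed sketch of that validation
// rule (names and types simplified; not the actual config.Server methods):
package main

import (
	"errors"
	"fmt"
)

var errHugepagesDisabledWithBdevs = errors.New(
	"hugepages cannot be disabled if bdevs have been specified in config")

type engineCfg struct{ bdevCount int }

type serverCfg struct {
	DisableHugepages bool
	Engines          []engineCfg
}

// validateHugepages mirrors the shape of the new check: disabling hugepages is
// only legal when no engine has NVMe bdevs assigned.
func validateHugepages(cfg serverCfg) error {
	bdevs := 0
	for _, e := range cfg.Engines {
		bdevs += e.bdevCount
	}
	if bdevs > 0 && cfg.DisableHugepages {
		return errHugepagesDisabledWithBdevs
	}
	return nil
}

func main() {
	cfg := serverCfg{DisableHugepages: true, Engines: []engineCfg{{bdevCount: 2}}}
	fmt.Println(validateHugepages(cfg))
}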
diff --git a/src/control/server/config/server_test.go b/src/control/server/config/server_test.go index f4d81d797dd..55082875f4d 100644 --- a/src/control/server/config/server_test.go +++ b/src/control/server/config/server_test.go @@ -865,7 +865,7 @@ func TestServerConfig_SetNrHugepages(t *testing.T) { ), ) }, - expErr: FaultConfigHugepagesDisabled, + expErr: FaultConfigHugepagesDisabledWithBdevs, }, "disabled hugepages; emulated bdevs configured": { extraConfig: func(c *Server) *Server { @@ -885,7 +885,7 @@ func TestServerConfig_SetNrHugepages(t *testing.T) { ), ) }, - expErr: FaultConfigHugepagesDisabled, + expErr: FaultConfigHugepagesDisabledWithBdevs, }, "disabled hugepages; no bdevs configured": { extraConfig: func(c *Server) *Server { diff --git a/src/control/server/ctl_storage_rpc.go b/src/control/server/ctl_storage_rpc.go index 3b6941ebbc4..71339918876 100644 --- a/src/control/server/ctl_storage_rpc.go +++ b/src/control/server/ctl_storage_rpc.go @@ -29,8 +29,11 @@ import ( ) const ( - msgFormatErr = "instance %d: failure formatting storage, check RPC response for details" - msgNvmeFormatSkip = "NVMe format skipped on instance %d as SCM format did not complete" + msgFormatErr = "instance %d: failure formatting storage, check RPC response for details" + msgNvmeFormatSkip = "NVMe format skipped on instance %d" + msgNvmeFormatSkipHPD = msgNvmeFormatSkip + ", use of hugepages disabled in config" + msgNvmeFormatSkipFail = msgNvmeFormatSkip + ", SCM format failed" + msgNvmeFormatSkipNotDone = msgNvmeFormatSkip + ", SCM was not formatted" // Storage size reserved for storing DAOS metadata stored on SCM device. // // NOTE This storage size value is larger than the minimal size observed (i.e. 36864B), @@ -47,6 +50,11 @@ const ( mdFsScmBytes uint64 = humanize.MiByte ) +var ( + errNoSrvCfg = errors.New("ControlService has no server config") + errNilReq = errors.New("nil request") +) + // newResponseState creates, populates and returns ResponseState. func newResponseState(inErr error, badStatus ctlpb.ResponseStatus, infoMsg string) *ctlpb.ResponseState { rs := new(ctlpb.ResponseState) @@ -231,7 +239,10 @@ func bdevScanAssigned(ctx context.Context, cs *ControlService, req *ctlpb.ScanNv // or not. If running, scan over dRPC. If not running then use engine's storage provider. func bdevScan(ctx context.Context, cs *ControlService, req *ctlpb.ScanNvmeReq, nsps []*ctlpb.ScmNamespace) (resp *ctlpb.ScanNvmeResp, err error) { if req == nil { - return nil, errors.New("nil request") + return nil, errNilReq + } + if cs.srvCfg != nil && cs.srvCfg.DisableHugepages { + return nil, errors.New("cannot scan bdevs if hugepages have been disabled") } defer func() { @@ -692,7 +703,10 @@ func (cs *ControlService) adjustScmSize(resp *ctlpb.ScanScmResp) { // StorageScan discovers non-volatile storage hardware on node. 
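// Editor's note: ctl_storage_rpc.go composes its three NVMe-skip messages from
// one prefix that keeps the %d verb, so fmt.Sprintf(msg, engine.Index()) works
// for any of them, and it lifts "nil request"/"no server config" into
// package-level sentinel errors the tests compare against. The constants and
// variables below are copied from the hunk; the handle helper is invented for
// illustration.
package main

import (
	"errors"
	"fmt"
)

const (
	msgNvmeFormatSkip        = "NVMe format skipped on instance %d"
	msgNvmeFormatSkipHPD     = msgNvmeFormatSkip + ", use of hugepages disabled in config"
	msgNvmeFormatSkipFail    = msgNvmeFormatSkip + ", SCM format failed"
	msgNvmeFormatSkipNotDone = msgNvmeFormatSkip + ", SCM was not formatted"
)

var (
	errNoSrvCfg = errors.New("ControlService has no server config")
	errNilReq   = errors.New("nil request")
)

// handle shows the common guard shape now shared by the storage RPC handlers.
func handle(req *struct{}, haveCfg bool) error {
	if req == nil {
		return errNilReq
	}
	if !haveCfg {
		return errNoSrvCfg
	}
	return nil
}

func main() {
	fmt.Println(fmt.Sprintf(msgNvmeFormatSkipHPD, 0))
	fmt.Println(errors.Is(handle(nil, true), errNilReq)) // true
}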
func (cs *ControlService) StorageScan(ctx context.Context, req *ctlpb.StorageScanReq) (*ctlpb.StorageScanResp, error) { if req == nil { - return nil, errors.New("nil request") + return nil, errNilReq + } + if cs.srvCfg == nil { + return nil, errNoSrvCfg } resp := new(ctlpb.StorageScanResp) @@ -702,11 +716,18 @@ func (cs *ControlService) StorageScan(ctx context.Context, req *ctlpb.StorageSca } resp.Scm = respScm - respNvme, err := scanBdevs(ctx, cs, req.Nvme, respScm.Namespaces) - if err != nil { - return nil, err + if cs.srvCfg.DisableHugepages { + cs.log.Notice("bdev scan skipped as use of hugepages disabled in config") + resp.Nvme = &ctlpb.ScanNvmeResp{ + State: new(ctlpb.ResponseState), + } + } else { + respNvme, err := scanBdevs(ctx, cs, req.Nvme, respScm.Namespaces) + if err != nil { + return nil, err + } + resp.Nvme = respNvme } - resp.Nvme = respNvme mi, err := cs.getMemInfo() if err != nil { @@ -874,10 +895,17 @@ func formatNvme(ctx context.Context, req formatNvmeReq, resp *ctlpb.StorageForma for idx, engine := range req.instances { _, hasError := req.errored[idx] _, skipped := req.skipped[idx] - if hasError || (skipped && !req.mdFormatted) { - // If scm failed to format or was already formatted, skip bdev format. + + // Skip NVMe format if scm was already formatted or failed to format. + skipReason := "" + if hasError { + skipReason = msgNvmeFormatSkipFail + } else if skipped && !req.mdFormatted { + skipReason = msgNvmeFormatSkipNotDone + } + if skipReason != "" { ret := engine.newCret(storage.NilBdevAddress, nil) - ret.State.Info = fmt.Sprintf(msgNvmeFormatSkip, engine.Index()) + ret.State.Info = fmt.Sprintf(skipReason, engine.Index()) resp.Crets = append(resp.Crets, ret) continue } @@ -934,6 +962,13 @@ func formatNvme(ctx context.Context, req formatNvmeReq, resp *ctlpb.StorageForma // Send response containing multiple results of format operations on scm mounts // and nvme controllers. func (cs *ControlService) StorageFormat(ctx context.Context, req *ctlpb.StorageFormatReq) (*ctlpb.StorageFormatResp, error) { + if req == nil { + return nil, errNilReq + } + if cs.srvCfg == nil { + return nil, errNoSrvCfg + } + instances := cs.harness.Instances() resp := new(ctlpb.StorageFormatResp) resp.Mrets = make([]*ctlpb.ScmMountResult, 0, len(instances)) @@ -955,30 +990,46 @@ func (cs *ControlService) StorageFormat(ctx context.Context, req *ctlpb.StorageF instances: instances, getMemInfo: cs.getMemInfo, } + cs.log.Tracef("formatScmReq: %+v", fsr) instanceErrors, instanceSkips, err := formatScm(ctx, fsr, resp) if err != nil { return nil, err } - fnr := formatNvmeReq{ - log: cs.log, - instances: instances, - errored: instanceErrors, - skipped: instanceSkips, - mdFormatted: mdFormatted, + hugepagesDisabled := false + if cs.srvCfg.DisableHugepages { + cs.log.Debug("skipping bdev format as use of hugepages disabled in config") + hugepagesDisabled = true + } else { + fnr := formatNvmeReq{ + log: cs.log, + instances: instances, + errored: instanceErrors, + skipped: instanceSkips, + mdFormatted: mdFormatted, + } + cs.log.Tracef("formatNvmeReq: %+v", fnr) + formatNvme(ctx, fnr, resp) } - formatNvme(ctx, fnr, resp) + + cs.log.Tracef("StorageFormatResp: %+v", resp) // Notify storage ready for instances formatted without error. // Block until all instances have formatted NVMe to avoid // VFIO device or resource busy when starting I/O Engines // because devices have already been claimed during format. 
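// Editor's note: StorageScan now short-circuits the bdev scan when hugepages
// are disabled, returning an empty NVMe response with a clean state rather
// than an error, so the SCM half of the scan still succeeds (SPDK needs
// hugepages to reach NVMe devices). A reduced sketch of that branch follows;
// the types are trimmed stand-ins, not the generated protobuf structs.
package main

import "fmt"

type nvmeResp struct {
	Ctrlrs []string
	State  string
}

type scanner func() (*nvmeResp, error)

// scanNvme returns an empty-but-valid response instead of scanning when the
// server config has disabled hugepages.
func scanNvme(disableHugepages bool, scan scanner) (*nvmeResp, error) {
	if disableHugepages {
		return &nvmeResp{State: "CTL_SUCCESS"}, nil
	}
	return scan()
}

func main() {
	resp, err := scanNvme(true, func() (*nvmeResp, error) {
		return &nvmeResp{Ctrlrs: []string{"0000:80:00.0"}, State: "CTL_SUCCESS"}, nil
	})
	fmt.Println(resp, err) // &{[] CTL_SUCCESS} <nil>
}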
- for idx, ei := range instances { + for idx, engine := range instances { + if hugepagesDisabled { + // Populate skip NVMe format results for all engines. + ret := engine.newCret(storage.NilBdevAddress, nil) + ret.State.Info = fmt.Sprintf(msgNvmeFormatSkipHPD, engine.Index()) + resp.Crets = append(resp.Crets, ret) + } if msg, hasError := instanceErrors[idx]; hasError { cs.log.Errorf("instance %d: %s", idx, msg) continue } - ei.NotifyStorageReady() + engine.NotifyStorageReady() } return resp, nil @@ -987,7 +1038,13 @@ func (cs *ControlService) StorageFormat(ctx context.Context, req *ctlpb.StorageF // StorageNvmeRebind rebinds SSD from kernel and binds to user-space to allow DAOS to use it. func (cs *ControlService) StorageNvmeRebind(ctx context.Context, req *ctlpb.NvmeRebindReq) (*ctlpb.NvmeRebindResp, error) { if req == nil { - return nil, errors.New("nil request") + return nil, errNilReq + } + if cs.srvCfg == nil { + return nil, errNoSrvCfg + } + if cs.srvCfg.DisableHugepages { + return nil, FaultHugepagesDisabled } cu, err := user.Current() @@ -1024,7 +1081,13 @@ func (cs *ControlService) StorageNvmeRebind(ctx context.Context, req *ctlpb.Nvme // If StorageTierIndex is set to -1 in request, add the device to the first configured bdev tier. func (cs *ControlService) StorageNvmeAddDevice(ctx context.Context, req *ctlpb.NvmeAddDeviceReq) (resp *ctlpb.NvmeAddDeviceResp, err error) { if req == nil { - return nil, errors.New("nil request") + return nil, errNilReq + } + if cs.srvCfg == nil { + return nil, errNoSrvCfg + } + if cs.srvCfg.DisableHugepages { + return nil, FaultHugepagesDisabled } engines := cs.harness.Instances() diff --git a/src/control/server/ctl_storage_rpc_test.go b/src/control/server/ctl_storage_rpc_test.go index 2dfc97cb310..4df8b2b0e65 100644 --- a/src/control/server/ctl_storage_rpc_test.go +++ b/src/control/server/ctl_storage_rpc_test.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2019-2023 Intel Corporation. +// (C) Copyright 2019-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -60,6 +60,7 @@ var ( func TestServer_bdevScan(t *testing.T) { for name, tc := range map[string]struct { req *ctlpb.ScanNvmeReq + disableHPs bool provRes *storage.BdevScanResponse provErr error engTierCfgs []storage.TierConfigs // one per-engine @@ -71,7 +72,12 @@ func TestServer_bdevScan(t *testing.T) { expBackendScanCalls []storage.BdevScanRequest }{ "nil request": { - expErr: errors.New("nil request"), + expErr: errNilReq, + }, + "hugepages disabled": { + req: &ctlpb.ScanNvmeReq{}, + disableHPs: true, + expErr: errors.New("hugepages have been disabled"), }, "scan local; no bdevs in config; scan fails": { req: &ctlpb.ScanNvmeReq{Health: true}, @@ -572,7 +578,8 @@ func TestServer_bdevScan(t *testing.T) { engCfg := engine.MockConfig().WithStorage(tcs...) engCfgs = append(engCfgs, engCfg) } - sCfg := config.DefaultServer().WithEngines(engCfgs...) + sCfg := config.DefaultServer().WithEngines(engCfgs...). 
+ WithDisableHugepages(tc.disableHPs) bmbc := &bdev.MockBackendConfig{ ScanRes: tc.provRes, @@ -643,9 +650,20 @@ func TestServer_CtlSvc_StorageScan(t *testing.T) { smbc *scm.MockBackendConfig tierCfgs storage.TierConfigs enginesNotReady bool + disableHPs bool + noSrvCfg bool + nilReq bool expResp *ctlpb.StorageScanResp expErr error }{ + "nil request": { + nilReq: true, + expErr: errNilReq, + }, + "missing server config": { + noSrvCfg: true, + expErr: errNoSrvCfg, + }, "successful scan; scm namespaces": { bdevScanRes: &ctlpb.ScanNvmeResp{ Ctrlrs: proto.NvmeControllers{ @@ -659,6 +677,10 @@ func TestServer_CtlSvc_StorageScan(t *testing.T) { GetNamespacesRes: storage.ScmNamespaces{storage.MockScmNamespace()}, }, tierCfgs: storage.TierConfigs{ + storage.NewTierConfig(). + WithStorageClass(storage.ClassDcpm.String()). + WithScmMountPoint("/mnt/daos0"). + WithScmDeviceList("/dev/pmem0"), storage.NewTierConfig(). WithStorageClass(storage.ClassNvme.String()). WithBdevDeviceList(ctrlr.PciAddr, test.MockPCIAddr(2)), @@ -759,6 +781,29 @@ func TestServer_CtlSvc_StorageScan(t *testing.T) { MemInfo: proto.MockPBMemInfo(), }, }, + "hugepages disabled": { + bdevScanRes: &ctlpb.ScanNvmeResp{ + Ctrlrs: proto.NvmeControllers{ + ctrlrPB, + }, + State: new(ctlpb.ResponseState), + }, + smbc: &scm.MockBackendConfig{ + GetModulesRes: storage.ScmModules{storage.MockScmModule()}, + GetNamespacesRes: storage.ScmNamespaces{storage.MockScmNamespace()}, + }, + disableHPs: true, + expResp: &ctlpb.StorageScanResp{ + Nvme: &ctlpb.ScanNvmeResp{ + State: &ctlpb.ResponseState{}, + }, + Scm: &ctlpb.ScanScmResp{ + Namespaces: proto.ScmNamespaces{proto.MockScmNamespace()}, + State: new(ctlpb.ResponseState), + }, + MemInfo: proto.MockPBMemInfo(), + }, + }, "scm module discovery failure": { bdevScanRes: &ctlpb.ScanNvmeResp{ Ctrlrs: proto.NvmeControllers{ @@ -853,7 +898,8 @@ func TestServer_CtlSvc_StorageScan(t *testing.T) { engineCfg := engine.MockConfig().WithStorage(tc.tierCfgs...) engineCfgs := []*engine.Config{engineCfg} - sCfg := config.DefaultServer().WithEngines(engineCfgs...) + sCfg := config.DefaultServer().WithEngines(engineCfgs...). 
+ WithDisableHugepages(tc.disableHPs) var cs *ControlService if tc.enginesNotReady { @@ -869,12 +915,15 @@ func TestServer_CtlSvc_StorageScan(t *testing.T) { scanBdevs = bdevScan }() - if tc.req == nil { + if tc.req == nil && !tc.nilReq { tc.req = &ctlpb.StorageScanReq{ Scm: new(ctlpb.ScanScmReq), Nvme: new(ctlpb.ScanNvmeReq), } } + if tc.noSrvCfg { + cs.srvCfg = nil + } resp, err := cs.StorageScan(test.Context(t), tc.req) test.CmpErr(t, tc.expErr, err) @@ -1012,11 +1061,31 @@ func TestServer_CtlSvc_StorageFormat(t *testing.T) { bmbcs []*bdev.MockBackendConfig awaitTimeout time.Duration getMemInfo func() (*common.MemInfo, error) + disableHPs bool + nilReq bool + noSrvCfg bool expAwaitExit bool expAwaitErr error expResp *ctlpb.StorageFormatResp + expErr error reformat bool // indicates setting of reformat parameter }{ + "nil request": { + nilReq: true, + expResp: &ctlpb.StorageFormatResp{ + Crets: []*ctlpb.NvmeControllerResult{}, + Mrets: []*ctlpb.ScmMountResult{}, + }, + expErr: errNilReq, + }, + "missing server config": { + noSrvCfg: true, + expResp: &ctlpb.StorageFormatResp{ + Crets: []*ctlpb.NvmeControllerResult{}, + Mrets: []*ctlpb.ScmMountResult{}, + }, + expErr: errNoSrvCfg, + }, "ram no nvme": { sMounts: []string{"/mnt/daos"}, sClass: storage.ClassRam, @@ -1047,6 +1116,48 @@ func TestServer_CtlSvc_StorageFormat(t *testing.T) { }, }, }, + "nvme and ram; use of hugepages disabled": { + sMounts: []string{"/mnt/daos"}, + sClass: storage.ClassRam, + sDevs: []string{"/dev/pmem1"}, // ignored if SCM class is ram + sSize: 6, + bClass: storage.ClassNvme, + bDevs: [][]string{{mockNvmeController0.PciAddr}}, + bmbcs: []*bdev.MockBackendConfig{ + { + ScanRes: &storage.BdevScanResponse{ + Controllers: storage.NvmeControllers{ + mockNvmeController0, + }, + }, + FormatRes: &storage.BdevFormatResponse{ + DeviceResponses: storage.BdevDeviceFormatResponses{ + mockNvmeController0.PciAddr: &storage.BdevDeviceFormatResponse{ + Formatted: true, + }, + }, + }, + }, + }, + disableHPs: true, + expResp: &ctlpb.StorageFormatResp{ + Crets: []*ctlpb.NvmeControllerResult{ + { + PciAddr: storage.NilBdevAddress, + State: &ctlpb.ResponseState{ + Status: ctlpb.ResponseStatus_CTL_SUCCESS, + Info: fmt.Sprintf(msgNvmeFormatSkipHPD, 0), + }, + }, + }, + Mrets: []*ctlpb.ScmMountResult{ + { + Mntpoint: "/mnt/daos", + State: new(ctlpb.ResponseState), + }, + }, + }, + }, "nvme and ram": { sMounts: []string{"/mnt/daos"}, sClass: storage.ClassRam, @@ -1177,7 +1288,8 @@ func TestServer_CtlSvc_StorageFormat(t *testing.T) { PciAddr: storage.NilBdevAddress, State: &ctlpb.ResponseState{ Status: ctlpb.ResponseStatus_CTL_SUCCESS, - Info: fmt.Sprintf(msgNvmeFormatSkip, 0), + Info: fmt.Sprintf(msgNvmeFormatSkipNotDone, + 0), }, }, }, @@ -1215,7 +1327,8 @@ func TestServer_CtlSvc_StorageFormat(t *testing.T) { PciAddr: storage.NilBdevAddress, State: &ctlpb.ResponseState{ Status: ctlpb.ResponseStatus_CTL_SUCCESS, - Info: fmt.Sprintf(msgNvmeFormatSkip, 0), + Info: fmt.Sprintf(msgNvmeFormatSkipNotDone, + 0), }, }, }, @@ -1327,7 +1440,8 @@ func TestServer_CtlSvc_StorageFormat(t *testing.T) { PciAddr: storage.NilBdevAddress, State: &ctlpb.ResponseState{ Status: ctlpb.ResponseStatus_CTL_SUCCESS, - Info: fmt.Sprintf(msgNvmeFormatSkip, 0), + Info: fmt.Sprintf(msgNvmeFormatSkipNotDone, + 0), }, }, }, @@ -1499,7 +1613,7 @@ func TestServer_CtlSvc_StorageFormat(t *testing.T) { } } - config := config.DefaultServer() + config := config.DefaultServer().WithDisableHugepages(tc.disableHPs) // validate test parameters if len(tc.sDevs) > 0 { @@ -1674,7 
+1788,7 @@ func TestServer_CtlSvc_StorageFormat(t *testing.T) { t.Log("rx on awaitCh from unusual awaitStorageReady() returns") test.CmpErr(t, tc.expAwaitErr, err) if !tc.expAwaitExit { - t.Fatal("unexpected exit from awaitStorageReady()") + t.Fatalf("unexpected exit from awaitStorageReady()") } case <-ctx.Done(): t.Logf("context done (%s)", ctx.Err()) @@ -1687,11 +1801,20 @@ func TestServer_CtlSvc_StorageFormat(t *testing.T) { } } - resp, fmtErr := cs.StorageFormat(test.Context(t), &ctlpb.StorageFormatReq{ - Reformat: tc.reformat, - }) + var req *ctlpb.StorageFormatReq + if !tc.nilReq { + req = &ctlpb.StorageFormatReq{ + Reformat: tc.reformat, + } + } + if tc.noSrvCfg { + cs.srvCfg = nil + } + + resp, fmtErr := cs.StorageFormat(test.Context(t), req) + test.CmpErr(t, tc.expErr, fmtErr) if fmtErr != nil { - t.Fatal(fmtErr) + return } test.AssertEqual(t, len(tc.expResp.Crets), len(resp.Crets), @@ -1733,12 +1856,21 @@ func TestServer_CtlSvc_StorageNvmeRebind(t *testing.T) { for name, tc := range map[string]struct { req *ctlpb.NvmeRebindReq bmbc *bdev.MockBackendConfig + disableHPs bool + noSrvCfg bool expErr error expResp *ctlpb.NvmeRebindResp expPrepCall *storage.BdevPrepareRequest }{ "nil request": { - expErr: errors.New("nil request"), + expErr: errNilReq, + }, + "missing server config": { + req: &ctlpb.NvmeRebindReq{ + PciAddr: test.MockPCIAddr(1), + }, + noSrvCfg: true, + expErr: errNoSrvCfg, }, "failure": { req: &ctlpb.NvmeRebindReq{ @@ -1758,6 +1890,16 @@ func TestServer_CtlSvc_StorageNvmeRebind(t *testing.T) { }, }, }, + "hugepages disabled": { + req: &ctlpb.NvmeRebindReq{ + PciAddr: test.MockPCIAddr(1), + }, + disableHPs: true, + bmbc: &bdev.MockBackendConfig{ + PrepareErr: errors.New("failure"), + }, + expErr: FaultHugepagesDisabled, + }, "success": { req: &ctlpb.NvmeRebindReq{ PciAddr: test.MockPCIAddr(1), @@ -1780,6 +1922,11 @@ func TestServer_CtlSvc_StorageNvmeRebind(t *testing.T) { scm.NewMockProvider(log, nil, nil), mbp, nil) cs := &ControlService{StorageControlService: *scs} + if !tc.noSrvCfg { + cs.srvCfg = config.DefaultServer(). + WithDisableHugepages(tc.disableHPs) + } + resp, err := cs.StorageNvmeRebind(test.Context(t), tc.req) mbb.RLock() @@ -1814,12 +1961,21 @@ func TestServer_CtlSvc_StorageNvmeAddDevice(t *testing.T) { req *ctlpb.NvmeAddDeviceReq bmbc *bdev.MockBackendConfig storageCfgs []storage.TierConfigs + disableHPs bool + noSrvCfg bool expErr error expDevList []string expResp *ctlpb.NvmeAddDeviceResp }{ "nil request": { - expErr: errors.New("nil request"), + expErr: errNilReq, + }, + "missing server config": { + req: &ctlpb.NvmeAddDeviceReq{ + PciAddr: test.MockPCIAddr(1), + }, + noSrvCfg: true, + expErr: errNoSrvCfg, }, "missing engine index 0": { req: &ctlpb.NvmeAddDeviceReq{ @@ -1856,6 +2012,20 @@ func TestServer_CtlSvc_StorageNvmeAddDevice(t *testing.T) { }, expErr: errors.New("no bdev storage tiers"), }, + "hugepages disabled": { + req: &ctlpb.NvmeAddDeviceReq{ + PciAddr: test.MockPCIAddr(1), + StorageTierIndex: -1, + }, + disableHPs: true, + storageCfgs: []storage.TierConfigs{ + { + storage.NewTierConfig(). + WithStorageClass(storage.ClassDcpm.String()), + }, + }, + expErr: FaultHugepagesDisabled, + }, "missing bdev config index 0": { req: &ctlpb.NvmeAddDeviceReq{ PciAddr: test.MockPCIAddr(1), @@ -2095,9 +2265,12 @@ func TestServer_CtlSvc_StorageNvmeAddDevice(t *testing.T) { ec.Index = uint32(idx) engineCfgs = append(engineCfgs, ec) } - serverCfg := config.DefaultServer().WithEngines(engineCfgs...) 
- + serverCfg := config.DefaultServer().WithEngines(engineCfgs...). + WithDisableHugepages(tc.disableHPs) cs := mockControlService(t, log, serverCfg, tc.bmbc, nil, nil) + if tc.noSrvCfg { + cs.srvCfg = nil + } resp, err := cs.StorageNvmeAddDevice(test.Context(t), tc.req) test.CmpErr(t, tc.expErr, err) diff --git a/src/control/server/faults.go b/src/control/server/faults.go index ffadcaa93c1..daf569fd737 100644 --- a/src/control/server/faults.go +++ b/src/control/server/faults.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2020-2022 Intel Corporation. +// (C) Copyright 2020-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -56,6 +56,11 @@ var ( "cannot destroy a pool with existing containers", "retry the operation with the recursive flag set to remove containers along with the pool", ) + FaultHugepagesDisabled = serverFault( + code.ServerHugepagesDisabled, + "the use of hugepages has been disabled in the server config", + "set false (or remove) disable_hugepages parameter in config and reformat storage, then retry the operation", + ) ) func FaultPoolInvalidServiceReps(maxSvcReps uint32) *fault.Fault { diff --git a/src/control/server/instance_storage_rpc.go b/src/control/server/instance_storage_rpc.go index 0b5ab937a39..d6f66be81dd 100644 --- a/src/control/server/instance_storage_rpc.go +++ b/src/control/server/instance_storage_rpc.go @@ -219,7 +219,7 @@ func scanEngineBdevsOverDrpc(ctx context.Context, engine Engine, pbReq *ctlpb.Sc // homogeneous hosts. engineRank, err := engine.GetRank() if err != nil { - engine.Debugf("instance %d GetRank: %s", engine.Index(), err.Error()) + return nil, errors.Wrapf(err, "instance %d GetRank", engine.Index()) } nsd := &ctlpb.SmdDevice{ RoleBits: sd.RoleBits, diff --git a/src/control/server/instance_storage_rpc_test.go b/src/control/server/instance_storage_rpc_test.go index 6a15b247e60..bd63a56d1d6 100644 --- a/src/control/server/instance_storage_rpc_test.go +++ b/src/control/server/instance_storage_rpc_test.go @@ -204,22 +204,7 @@ func TestIOEngineInstance_bdevScanEngine(t *testing.T) { req: ctlpb.ScanNvmeReq{}, rank: -1, smdRes: defSmdScanRes(), - expResp: &ctlpb.ScanNvmeResp{ - Ctrlrs: proto.NvmeControllers{ - func() *ctlpb.NvmeController { - c := proto.MockNvmeController(2) - c.HealthStats = nil - c.SmdDevices = []*ctlpb.SmdDevice{ - { - Rank: uint32(ranklist.NilRank), - RoleBits: storage.BdevRoleAll, - }, - } - return c - }(), - }, - State: new(ctlpb.ResponseState), - }, + expErr: errors.New("nil superblock"), }, "scan over drpc; with health": { req: ctlpb.ScanNvmeReq{Health: true}, diff --git a/src/control/server/server_utils.go b/src/control/server/server_utils.go index 596bb0c50a9..792ffa54541 100644 --- a/src/control/server/server_utils.go +++ b/src/control/server/server_utils.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2021-2023 Intel Corporation. +// (C) Copyright 2021-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -523,9 +523,11 @@ func registerEngineEventCallbacks(srv *server, engine *EngineInstance, allStarte engine.OnStorageReady(func(_ context.Context) error { srv.log.Debugf("engine %d: storage ready", engine.Index()) - // Attempt to remove unused hugepages, log error only. - if err := cleanEngineHugepages(srv); err != nil { - srv.log.Errorf(err.Error()) + if !srv.cfg.DisableHugepages { + // Attempt to remove unused hugepages, log error only. 
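// Editor's note: the instance_storage_rpc.go hunk above stops swallowing
// GetRank failures (previously only logged at debug level) and returns a
// wrapped error instead, which is why the test now expects "nil superblock".
// The PR uses pkg/errors.Wrapf; this is a stdlib-only sketch of the same shape
// with stubbed helpers.
package main

import (
	"errors"
	"fmt"
)

var errNilSuperblock = errors.New("nil superblock")

func getRank(haveSuperblock bool) (int, error) {
	if !haveSuperblock {
		return -1, errNilSuperblock
	}
	return 1, nil
}

func scanEngineBdevs(idx int, haveSuperblock bool) error {
	rank, err := getRank(haveSuperblock)
	if err != nil {
		// Fail the scan rather than logging and carrying on with a bogus rank.
		return fmt.Errorf("instance %d GetRank: %w", idx, err)
	}
	fmt.Printf("instance %d scanning as rank %d\n", idx, rank)
	return nil
}

func main() {
	fmt.Println(scanEngineBdevs(0, false)) // instance 0 GetRank: nil superblock
}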
+			if err := cleanEngineHugepages(srv); err != nil {
+				srv.log.Errorf(err.Error())
+			}
 		}
 
 		// Retrieve up-to-date meminfo to check resource availability.
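// Editor's note: the final hunk wraps the post-format hugepage cleanup in the
// same DisableHugepages gate used throughout the PR, so a hugepage-free server
// never tries to reclaim pages it never allocated. Minimal sketch of the
// callback shape; cleanEngineHugepages is stubbed here, the real one drives
// the bdev provider's prepare/reset path.
package main

import (
	"errors"
	"fmt"
)

type server struct {
	disableHugepages bool
}

func (s *server) cleanEngineHugepages() error {
	return errors.New("simulated cleanup failure")
}

// onStorageReady mirrors the callback: cleanup errors are logged, not fatal,
// and the whole step is skipped when hugepages are disabled in config.
func (s *server) onStorageReady() {
	if !s.disableHugepages {
		if err := s.cleanEngineHugepages(); err != nil {
			fmt.Println("error:", err) // log only, engine start continues
		}
	}
	fmt.Println("storage ready")
}

func main() {
	(&server{disableHugepages: true}).onStorageReady()
}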