Skip to content

Commit

Permalink
issue-1444: Use kernel delay accounting to calculate cpu wait (#1630)
Browse files Browse the repository at this point in the history
  • Loading branch information
antonmyagkov authored Feb 5, 2025
1 parent d4b6efd commit 75c596b
Show file tree
Hide file tree
Showing 42 changed files with 534 additions and 127 deletions.
4 changes: 4 additions & 0 deletions cloud/blockstore/config/diagnostics.proto
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ package NCloud.NBlockStore.NProto;

option go_package = "github.com/ydb-platform/nbs/cloud/blockstore/config";

import "cloud/storage/core/protos/diagnostics.proto";
import "cloud/storage/core/protos/trace.proto";

////////////////////////////////////////////////////////////////////////////////
Expand Down Expand Up @@ -219,4 +220,7 @@ message TDiagnosticsConfig

// Performance measurements coefficients for local HDD disks.
optional TVolumePerfSettings LocalHDDPerfSettings = 51;

// Type of the stats fetcher used to collect CPU stats.
optional NCloud.NProto.EStatsFetcherType StatsFetcherType = 52;
}
6 changes: 3 additions & 3 deletions cloud/blockstore/libs/daemon/common/bootstrap.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@
#include <cloud/storage/core/libs/common/timer.h>
#include <cloud/storage/core/libs/coroutine/executor.h>
#include <cloud/storage/core/libs/daemon/mlock.h>
#include <cloud/storage/core/libs/diagnostics/cgroup_stats_fetcher.h>
#include <cloud/storage/core/libs/diagnostics/stats_fetcher.h>
#include <cloud/storage/core/libs/diagnostics/critical_events.h>
#include <cloud/storage/core/libs/diagnostics/logging.h>
#include <cloud/storage/core/libs/diagnostics/monitoring.h>
Expand Down Expand Up @@ -861,7 +861,7 @@ void TBootstrapBase::Start()
START_KIKIMR_COMPONENT(NotifyService);
START_COMMON_COMPONENT(Monitoring);
START_COMMON_COMPONENT(ProfileLog);
START_KIKIMR_COMPONENT(CgroupStatsFetcher);
START_KIKIMR_COMPONENT(StatsFetcher);
START_COMMON_COMPONENT(DiscoveryService);
START_COMMON_COMPONENT(TraceProcessor);
START_KIKIMR_COMPONENT(TraceSerializer);
Expand Down Expand Up @@ -967,7 +967,7 @@ void TBootstrapBase::Stop()
STOP_KIKIMR_COMPONENT(TraceSerializer);
STOP_COMMON_COMPONENT(TraceProcessor);
STOP_COMMON_COMPONENT(DiscoveryService);
STOP_KIKIMR_COMPONENT(CgroupStatsFetcher);
STOP_KIKIMR_COMPONENT(StatsFetcher);
STOP_COMMON_COMPONENT(ProfileLog);
STOP_COMMON_COMPONENT(Monitoring);
STOP_KIKIMR_COMPONENT(LogbrokerService);
Expand Down
2 changes: 1 addition & 1 deletion cloud/blockstore/libs/daemon/common/bootstrap.h
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ class TBootstrapBase
virtual IStartable* GetTraceSerializer() = 0;
virtual IStartable* GetLogbrokerService() = 0;
virtual IStartable* GetNotifyService() = 0;
virtual IStartable* GetCgroupStatsFetcher() = 0;
virtual IStartable* GetStatsFetcher() = 0;
virtual IStartable* GetIamTokenClient() = 0;
virtual IStartable* GetComputeClient() = 0;
virtual IStartable* GetKmsClient() = 0;
Expand Down
2 changes: 1 addition & 1 deletion cloud/blockstore/libs/daemon/local/bootstrap.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ class TBootstrapLocal final
IStartable* GetTraceSerializer() override { return nullptr; }
IStartable* GetLogbrokerService() override { return nullptr; }
IStartable* GetNotifyService() override { return nullptr; }
IStartable* GetCgroupStatsFetcher() override { return nullptr; }
IStartable* GetStatsFetcher() override { return nullptr; }
IStartable* GetIamTokenClient() override { return nullptr; }
IStartable* GetComputeClient() override { return nullptr; }
IStartable* GetKmsClient() override { return nullptr; }
Expand Down
12 changes: 6 additions & 6 deletions cloud/blockstore/libs/daemon/ydb/bootstrap.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@
#include <cloud/storage/core/libs/common/proto_helpers.h>
#include <cloud/storage/core/libs/common/task_queue.h>
#include <cloud/storage/core/libs/common/thread_pool.h>
#include <cloud/storage/core/libs/diagnostics/cgroup_stats_fetcher.h>
#include <cloud/storage/core/libs/diagnostics/stats_fetcher.h>
#include <cloud/storage/core/libs/diagnostics/trace_serializer.h>
#include <cloud/storage/core/libs/iam/iface/client.h>
#include <cloud/storage/core/libs/iam/iface/config.h>
Expand Down Expand Up @@ -131,7 +131,7 @@ IStartable* TBootstrapYdb::GetYdbStorage() { return YdbStorage.get(); }
IStartable* TBootstrapYdb::GetTraceSerializer() { return TraceSerializer.get(); }
IStartable* TBootstrapYdb::GetLogbrokerService() { return LogbrokerService.get(); }
IStartable* TBootstrapYdb::GetNotifyService() { return NotifyService.get(); }
IStartable* TBootstrapYdb::GetCgroupStatsFetcher() { return CgroupStatsFetcher.get(); }
IStartable* TBootstrapYdb::GetStatsFetcher() { return StatsFetcher.get(); }
IStartable* TBootstrapYdb::GetIamTokenClient() { return IamTokenClient.get(); }
IStartable* TBootstrapYdb::GetComputeClient() { return ComputeClient.get(); }
IStartable* TBootstrapYdb::GetKmsClient() { return KmsClient.get(); }
Expand Down Expand Up @@ -499,11 +499,11 @@ void TBootstrapYdb::InitKikimrService()

STORAGE_INFO("ProfileLog initialized");

CgroupStatsFetcher = BuildCgroupStatsFetcher(
StatsFetcher = NCloud::NStorage::BuildStatsFetcher(
Configs->DiagnosticsConfig->GetStatsFetcherType(),
Configs->DiagnosticsConfig->GetCpuWaitFilename(),
Log,
logging,
"BLOCKSTORE_CGROUPS");
logging);

if (Configs->StorageConfig->GetBlockDigestsEnabled()) {
if (Configs->StorageConfig->GetUseTestBlockDigestGenerator()) {
Expand Down Expand Up @@ -553,7 +553,7 @@ void TBootstrapYdb::InitKikimrService()
args.LogbrokerService = LogbrokerService;
args.NotifyService = NotifyService;
args.VolumeStats = VolumeStats;
args.CgroupStatsFetcher = CgroupStatsFetcher;
args.StatsFetcher = StatsFetcher;
args.RdmaServer = nullptr;
args.RdmaClient = RdmaClient;
args.Logging = logging;
Expand Down
4 changes: 2 additions & 2 deletions cloud/blockstore/libs/daemon/ydb/bootstrap.h
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ struct TBootstrapYdb final
ITraceSerializerPtr TraceSerializer;
NLogbroker::IServicePtr LogbrokerService;
NNotify::IServicePtr NotifyService;
NCloud::NStorage::ICgroupStatsFetcherPtr CgroupStatsFetcher;
NCloud::NStorage::IStatsFetcherPtr StatsFetcher;
NIamClient::IIamTokenClientPtr IamTokenClient;
IComputeClientPtr ComputeClient;
IKmsClientPtr KmsClient;
Expand Down Expand Up @@ -115,7 +115,7 @@ struct TBootstrapYdb final
IStartable* GetTraceSerializer() override;
IStartable* GetLogbrokerService() override;
IStartable* GetNotifyService() override;
IStartable* GetCgroupStatsFetcher() override;
IStartable* GetStatsFetcher() override;
IStartable* GetIamTokenClient() override;
IStartable* GetComputeClient() override;
IStartable* GetKmsClient() override;
Expand Down
10 changes: 10 additions & 0 deletions cloud/blockstore/libs/diagnostics/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ namespace {
xxx(LocalHDDDowntimeThreshold, TDuration, TDuration::Seconds(15) )\
xxx(ReportHistogramAsMultipleCounters, bool, true )\
xxx(ReportHistogramAsSingleCounter, bool, false )\
xxx(StatsFetcherType, NCloud::NProto::EStatsFetcherType, NCloud::NProto::EStatsFetcherType::CGROUP )\
// BLOCKSTORE_DIAGNOSTICS_CONFIG

#define BLOCKSTORE_DIAGNOSTICS_DECLARE_CONFIG(name, type, value) \
Expand Down Expand Up @@ -287,3 +288,12 @@ void Out<NCloud::TRequestThresholds>(
{
OutRequestThresholds(out, value);
}

// Stream output for EStatsFetcherType: writes the protobuf-generated name of
// the enum value to the given stream.
template <>
void Out<NCloud::NProto::EStatsFetcherType>(
    IOutputStream& out,
    NCloud::NProto::EStatsFetcherType statsFetcherType)
{
    const auto& typeName =
        NCloud::NProto::EStatsFetcherType_Name(statsFetcherType);
    out << typeName;
}
2 changes: 2 additions & 0 deletions cloud/blockstore/libs/diagnostics/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,8 @@ class TDiagnosticsConfig
TRequestThresholds GetRequestThresholds() const;
EHistogramCounterOptions GetHistogramCounterOptions() const;

NCloud::NProto::EStatsFetcherType GetStatsFetcherType() const;

void Dump(IOutputStream& out) const;
void DumpHtml(IOutputStream& out) const;
};
Expand Down
2 changes: 1 addition & 1 deletion cloud/blockstore/libs/storage/init/server/actorsystem.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -308,7 +308,7 @@ class TStorageServicesInitializer final
auto volumeBalancerService = CreateVolumeBalancerActor(
Args.StorageConfig,
Args.VolumeStats,
Args.CgroupStatsFetcher,
Args.StatsFetcher,
Args.VolumeBalancerSwitch,
MakeStorageServiceId());

Expand Down
2 changes: 1 addition & 1 deletion cloud/blockstore/libs/storage/init/server/actorsystem.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ struct TServerActorSystemArgs
IVolumeStatsPtr VolumeStats;
NRdma::IServerPtr RdmaServer;
NRdma::IClientPtr RdmaClient;
NCloud::NStorage::ICgroupStatsFetcherPtr CgroupStatsFetcher;
NCloud::NStorage::IStatsFetcherPtr StatsFetcher;
TManuallyPreemptedVolumesPtr PreemptedVolumes;
NNvme::INvmeManagerPtr NvmeManager;
IVolumeBalancerSwitchPtr VolumeBalancerSwitch;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,14 @@ using namespace NActors;
// Factory for the volume balancer actor: constructs a TVolumeBalancerActor
// from the given dependencies (the *Ptr arguments are moved in,
// serviceActorId is passed as is) and returns it.
// NOTE(review): this span is a rendered diff hunk, so the stats fetcher
// parameter and its std::move each appear twice — the old
// ICgroupStatsFetcherPtr line followed by its IStatsFetcherPtr replacement.
IActorPtr CreateVolumeBalancerActor(
TStorageConfigPtr storageConfig,
IVolumeStatsPtr volumeStats,
NCloud::NStorage::ICgroupStatsFetcherPtr cgroupStatFetcher,
NCloud::NStorage::IStatsFetcherPtr statFetcher,
IVolumeBalancerSwitchPtr volumeBalancerSwitch,
NActors::TActorId serviceActorId)
{
return std::make_unique<TVolumeBalancerActor>(
std::move(storageConfig),
std::move(volumeStats),
std::move(cgroupStatFetcher),
std::move(statFetcher),
std::move(volumeBalancerSwitch),
serviceActorId);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ namespace NCloud::NBlockStore::NStorage {
// Declaration of the volume balancer actor factory.
// NOTE(review): the IStatsFetcherPtr parameter below still carries the old
// name cgroupStatFetcher; presumably it should be renamed (e.g. statFetcher)
// to match the rest of the CgroupStatsFetcher -> StatsFetcher rename —
// confirm against the definition.
NActors::IActorPtr CreateVolumeBalancerActor(
TStorageConfigPtr storageConfig,
IVolumeStatsPtr volumeStats,
NCloud::NStorage::ICgroupStatsFetcherPtr cgroupStatFetcher,
NCloud::NStorage::IStatsFetcherPtr cgroupStatFetcher,
IVolumeBalancerSwitchPtr volumeBalancerSwitch,
NActors::TActorId serviceActorId);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
#include <cloud/blockstore/libs/storage/core/config.h>
#include <cloud/blockstore/libs/storage/core/proto_helpers.h>

#include <cloud/storage/core/libs/diagnostics/cgroup_stats_fetcher.h>
#include <cloud/storage/core/libs/diagnostics/stats_fetcher.h>

#include <contrib/ydb/library/actors/core/actor_bootstrapped.h>

Expand Down Expand Up @@ -140,12 +140,12 @@ STFUNC(TRemoteVolumeStatActor::StateWork)
TVolumeBalancerActor::TVolumeBalancerActor(
TStorageConfigPtr storageConfig,
IVolumeStatsPtr volumeStats,
NCloud::NStorage::ICgroupStatsFetcherPtr cgroupStatsFetcher,
NCloud::NStorage::IStatsFetcherPtr statsFetcher,
IVolumeBalancerSwitchPtr volumeBalancerSwitch,
TActorId serviceActorId)
: StorageConfig(std::move(storageConfig))
, VolumeStats(std::move(volumeStats))
, CgroupStatsFetcher(std::move(cgroupStatsFetcher))
, StatsFetcher(std::move(statsFetcher))
, VolumeBalancerSwitch(std::move(volumeBalancerSwitch))
, ServiceActorId(serviceActorId)
, State(std::make_unique<TVolumeBalancerState>(StorageConfig))
Expand Down Expand Up @@ -246,7 +246,7 @@ void TVolumeBalancerActor::HandleGetVolumeStatsResponse(
auto now = ctx.Now();

auto interval = (now - LastCpuWaitQuery).MicroSeconds();
auto [cpuWait, error] = CgroupStatsFetcher->GetCpuWait();
auto [cpuWait, error] = StatsFetcher->GetCpuWait();
if (HasError(error)) {
*CpuWaitFailure = 1;
LOG_TRACE_S(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ class TVolumeBalancerActor final
private:
const TStorageConfigPtr StorageConfig;
const IVolumeStatsPtr VolumeStats;
const NCloud::NStorage::ICgroupStatsFetcherPtr CgroupStatsFetcher;
const NCloud::NStorage::IStatsFetcherPtr StatsFetcher;
const IVolumeBalancerSwitchPtr VolumeBalancerSwitch;
const NActors::TActorId ServiceActorId;

Expand All @@ -50,7 +50,7 @@ class TVolumeBalancerActor final
TVolumeBalancerActor(
TStorageConfigPtr storageConfig,
IVolumeStatsPtr volumeStats,
NCloud::NStorage::ICgroupStatsFetcherPtr cgroupStatsFetcher,
NCloud::NStorage::IStatsFetcherPtr statsFetcher,
IVolumeBalancerSwitchPtr volumeBalancerSwitch,
NActors::TActorId serviceActorId);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
#include <cloud/blockstore/libs/storage/core/public.h>
#include <cloud/blockstore/libs/storage/volume_balancer/volume_balancer.h>

#include <cloud/storage/core/libs/diagnostics/cgroup_stats_fetcher.h>
#include <cloud/storage/core/libs/diagnostics/stats_fetcher.h>
#include <cloud/storage/core/libs/features/features_config.h>

#include <library/cpp/testing/unittest/registar.h>
Expand Down Expand Up @@ -194,7 +194,7 @@ struct TVolumeStatsTestMock final

////////////////////////////////////////////////////////////////////////////////

struct TCgroupStatsFetcherMock: public NCloud::NStorage::ICgroupStatsFetcher
struct TStatsFetcherMock: public NCloud::NStorage::IStatsFetcher
{
TResultOrError<TDuration> Value = TDuration::Zero();

Expand Down Expand Up @@ -230,14 +230,14 @@ class TVolumeBalancerTestEnv

public:
std::shared_ptr<TVolumeStatsTestMock> VolumeStats;
std::shared_ptr<TCgroupStatsFetcherMock> Fetcher;
std::shared_ptr<TStatsFetcherMock> Fetcher;

public:
TVolumeBalancerTestEnv()
{
Sender = TestEnv.GetRuntime().AllocateEdgeActor();
VolumeStats = std::make_shared<TVolumeStatsTestMock>();
Fetcher = std::make_shared<TCgroupStatsFetcherMock>();
Fetcher = std::make_shared<TStatsFetcherMock>();
}

TActorId GetEdgeActor() const
Expand Down Expand Up @@ -411,7 +411,7 @@ NFeatures::TFeaturesConfigPtr CreateFeatureConfig(
IActorPtr CreateVolumeBalancerActor(
TVolumeBalancerConfigBuilder& config,
IVolumeStatsPtr volumeStats,
NCloud::NStorage::ICgroupStatsFetcherPtr cgroupStatsFetcher,
NCloud::NStorage::IStatsFetcherPtr statsFetcher,
TActorId serviceActorId)
{
NProto::TStorageServiceConfig storageConfig = config.Build();
Expand All @@ -425,7 +425,7 @@ IActorPtr CreateVolumeBalancerActor(
CreateFeatureConfig("Balancer", {})
),
std::move(volumeStats),
std::move(cgroupStatsFetcher),
std::move(statsFetcher),
std::move(volumeBalancerSwitch),
std::move(serviceActorId));
}
Expand Down
4 changes: 4 additions & 0 deletions cloud/filestore/config/diagnostics.proto
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ package NCloud.NFileStore.NProto;

option go_package = "github.com/ydb-platform/nbs/cloud/filestore/config";

import "cloud/storage/core/protos/diagnostics.proto";
import "cloud/storage/core/protos/trace.proto";

////////////////////////////////////////////////////////////////////////////////
Expand Down Expand Up @@ -137,4 +138,7 @@ message TDiagnosticsConfig

// Performance profile for SSD filesystems.
optional TFileSystemPerformanceProfile SSDFileSystemPerformanceProfile = 27;

// Type of the stats fetcher used to collect CPU stats.
optional NCloud.NProto.EStatsFetcherType StatsFetcherType = 28;
}
16 changes: 8 additions & 8 deletions cloud/filestore/libs/daemon/common/bootstrap.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
#include <cloud/storage/core/libs/common/thread_pool.h>
#include <cloud/storage/core/libs/common/timer.h>
#include <cloud/storage/core/libs/daemon/mlock.h>
#include <cloud/storage/core/libs/diagnostics/cgroup_stats_fetcher.h>
#include <cloud/storage/core/libs/diagnostics/stats_fetcher.h>
#include <cloud/storage/core/libs/diagnostics/critical_events.h>
#include <cloud/storage/core/libs/diagnostics/logging.h>
#include <cloud/storage/core/libs/diagnostics/monitoring.h>
Expand Down Expand Up @@ -90,7 +90,7 @@ void TBootstrapCommon::Start()
FILESTORE_LOG_START_COMPONENT(BackgroundThreadPool);
FILESTORE_LOG_START_COMPONENT(ProfileLog);
FILESTORE_LOG_START_COMPONENT(RequestStatsUpdater);
FILESTORE_LOG_START_COMPONENT(CgroupStatsFetcher);
FILESTORE_LOG_START_COMPONENT(StatsFetcher);

StartComponents();

Expand Down Expand Up @@ -119,7 +119,7 @@ void TBootstrapCommon::Stop()

StopComponents();

FILESTORE_LOG_STOP_COMPONENT(CgroupStatsFetcher);
FILESTORE_LOG_STOP_COMPONENT(StatsFetcher);
FILESTORE_LOG_STOP_COMPONENT(RequestStatsUpdater);
FILESTORE_LOG_STOP_COMPONENT(ProfileLog);
FILESTORE_LOG_STOP_COMPONENT(BackgroundThreadPool);
Expand Down Expand Up @@ -271,16 +271,16 @@ void TBootstrapCommon::InitActorSystem()
STORAGE_INFO("TraceSerializer initialized");

auto cpuWaitFilename = Configs->DiagnosticsConfig->GetCpuWaitFilename();
CgroupStatsFetcher = BuildCgroupStatsFetcher(
StatsFetcher = NCloud::NStorage::BuildStatsFetcher(
Configs->DiagnosticsConfig->GetStatsFetcherType(),
cpuWaitFilename.empty()
? NCloud::NStorage::BuildCpuWaitStatsFilename(
Configs->DiagnosticsConfig->GetCpuWaitServiceName())
: std::move(cpuWaitFilename),
Log,
logging,
"FILESTORE_CGROUPS");
logging);

STORAGE_INFO("CgroupStatsFetcher initialized");
STORAGE_INFO("StatsFetcher initialized");

NStorage::TActorSystemArgs args;
args.NodeId = nodeId;
Expand All @@ -292,7 +292,7 @@ void TBootstrapCommon::InitActorSystem()
args.DiagnosticsConfig = Configs->DiagnosticsConfig;
args.Metrics = Metrics;
args.UserCounters = UserCounters;
args.CgroupStatsFetcher = CgroupStatsFetcher;
args.StatsFetcher = StatsFetcher;
args.ModuleFactories = ModuleFactories;

ActorSystem = NStorage::CreateActorSystem(args);
Expand Down
2 changes: 1 addition & 1 deletion cloud/filestore/libs/daemon/common/bootstrap.h
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ class TBootstrapCommon
ITaskQueuePtr BackgroundThreadPool;
IProfileLogPtr ProfileLog;
IActorSystemPtr ActorSystem;
NCloud::NStorage::ICgroupStatsFetcherPtr CgroupStatsFetcher;
NCloud::NStorage::IStatsFetcherPtr StatsFetcher;

public:
TBootstrapCommon(
Expand Down
Loading

0 comments on commit 75c596b

Please sign in to comment.