From 75c596b8cb98d291e6ab6dd7b8e1e7be34135c77 Mon Sep 17 00:00:00 2001 From: Anton Myagkov Date: Wed, 5 Feb 2025 10:45:46 +0100 Subject: [PATCH] issue-1444: Use kernel delay accounting to calculate cpu wait (#1630) --- cloud/blockstore/config/diagnostics.proto | 4 + .../libs/daemon/common/bootstrap.cpp | 6 +- .../blockstore/libs/daemon/common/bootstrap.h | 2 +- .../blockstore/libs/daemon/local/bootstrap.h | 2 +- .../blockstore/libs/daemon/ydb/bootstrap.cpp | 12 +- cloud/blockstore/libs/daemon/ydb/bootstrap.h | 4 +- cloud/blockstore/libs/diagnostics/config.cpp | 10 + cloud/blockstore/libs/diagnostics/config.h | 2 + .../libs/storage/init/server/actorsystem.cpp | 2 +- .../libs/storage/init/server/actorsystem.h | 2 +- .../volume_balancer/volume_balancer.cpp | 4 +- .../storage/volume_balancer/volume_balancer.h | 2 +- .../volume_balancer/volume_balancer_actor.cpp | 8 +- .../volume_balancer/volume_balancer_actor.h | 4 +- .../volume_balancer/volume_balancer_ut.cpp | 12 +- cloud/filestore/config/diagnostics.proto | 4 + .../libs/daemon/common/bootstrap.cpp | 16 +- .../filestore/libs/daemon/common/bootstrap.h | 2 +- cloud/filestore/libs/diagnostics/config.cpp | 11 + cloud/filestore/libs/diagnostics/config.h | 2 + .../libs/storage/init/actorsystem.cpp | 2 +- .../filestore/libs/storage/init/actorsystem.h | 2 +- .../libs/storage/service/service.cpp | 4 +- .../filestore/libs/storage/service/service.h | 2 +- .../libs/storage/service/service_actor.cpp | 4 +- .../libs/storage/service/service_actor.h | 4 +- .../service/service_actor_update_stats.cpp | 7 +- .../libs/storage/testlib/test_env.cpp | 4 +- .../libs/diagnostics/cgroup_stats_fetcher.cpp | 64 +---- .../diagnostics/cgroup_stats_fetcher_ut.cpp | 2 +- cloud/storage/core/libs/diagnostics/public.h | 4 +- .../core/libs/diagnostics/qemu_ut/bin/ya.make | 9 + .../core/libs/diagnostics/qemu_ut/test.py | 9 + .../core/libs/diagnostics/qemu_ut/ya.make | 16 ++ .../core/libs/diagnostics/stats_fetcher.cpp | 79 ++++++ 
...cgroup_stats_fetcher.h => stats_fetcher.h} | 24 +- .../libs/diagnostics/task_stats_fetcher.cpp | 252 ++++++++++++++++++ .../diagnostics/task_stats_fetcher_ut.cpp | 39 +++ cloud/storage/core/libs/diagnostics/ya.make | 5 +- cloud/storage/core/protos/diagnostics.proto | 15 ++ cloud/storage/core/protos/ya.make | 1 + .../tools/analytics/cpu-wait-monitor/main.cpp | 2 +- 42 files changed, 534 insertions(+), 127 deletions(-) create mode 100644 cloud/storage/core/libs/diagnostics/qemu_ut/bin/ya.make create mode 100644 cloud/storage/core/libs/diagnostics/qemu_ut/test.py create mode 100644 cloud/storage/core/libs/diagnostics/qemu_ut/ya.make create mode 100644 cloud/storage/core/libs/diagnostics/stats_fetcher.cpp rename cloud/storage/core/libs/diagnostics/{cgroup_stats_fetcher.h => stats_fetcher.h} (60%) create mode 100644 cloud/storage/core/libs/diagnostics/task_stats_fetcher.cpp create mode 100644 cloud/storage/core/libs/diagnostics/task_stats_fetcher_ut.cpp create mode 100644 cloud/storage/core/protos/diagnostics.proto diff --git a/cloud/blockstore/config/diagnostics.proto b/cloud/blockstore/config/diagnostics.proto index cee11ee8635..47a4d984941 100644 --- a/cloud/blockstore/config/diagnostics.proto +++ b/cloud/blockstore/config/diagnostics.proto @@ -4,6 +4,7 @@ package NCloud.NBlockStore.NProto; option go_package = "github.com/ydb-platform/nbs/cloud/blockstore/config"; +import "cloud/storage/core/protos/diagnostics.proto"; import "cloud/storage/core/protos/trace.proto"; //////////////////////////////////////////////////////////////////////////////// @@ -219,4 +220,7 @@ message TDiagnosticsConfig // Performance measurements coefficients for local HDD disks. 
optional TVolumePerfSettings LocalHDDPerfSettings = 51; + + // Type of fetching CPU stats + optional NCloud.NProto.EStatsFetcherType StatsFetcherType = 52; } diff --git a/cloud/blockstore/libs/daemon/common/bootstrap.cpp b/cloud/blockstore/libs/daemon/common/bootstrap.cpp index f06be81fef5..7f86a2ddaf1 100644 --- a/cloud/blockstore/libs/daemon/common/bootstrap.cpp +++ b/cloud/blockstore/libs/daemon/common/bootstrap.cpp @@ -86,7 +86,7 @@ #include #include #include -#include +#include #include #include #include @@ -861,7 +861,7 @@ void TBootstrapBase::Start() START_KIKIMR_COMPONENT(NotifyService); START_COMMON_COMPONENT(Monitoring); START_COMMON_COMPONENT(ProfileLog); - START_KIKIMR_COMPONENT(CgroupStatsFetcher); + START_KIKIMR_COMPONENT(StatsFetcher); START_COMMON_COMPONENT(DiscoveryService); START_COMMON_COMPONENT(TraceProcessor); START_KIKIMR_COMPONENT(TraceSerializer); @@ -967,7 +967,7 @@ void TBootstrapBase::Stop() STOP_KIKIMR_COMPONENT(TraceSerializer); STOP_COMMON_COMPONENT(TraceProcessor); STOP_COMMON_COMPONENT(DiscoveryService); - STOP_KIKIMR_COMPONENT(CgroupStatsFetcher); + STOP_KIKIMR_COMPONENT(StatsFetcher); STOP_COMMON_COMPONENT(ProfileLog); STOP_COMMON_COMPONENT(Monitoring); STOP_KIKIMR_COMPONENT(LogbrokerService); diff --git a/cloud/blockstore/libs/daemon/common/bootstrap.h b/cloud/blockstore/libs/daemon/common/bootstrap.h index 79bf75c2329..8df63dfe345 100644 --- a/cloud/blockstore/libs/daemon/common/bootstrap.h +++ b/cloud/blockstore/libs/daemon/common/bootstrap.h @@ -108,7 +108,7 @@ class TBootstrapBase virtual IStartable* GetTraceSerializer() = 0; virtual IStartable* GetLogbrokerService() = 0; virtual IStartable* GetNotifyService() = 0; - virtual IStartable* GetCgroupStatsFetcher() = 0; + virtual IStartable* GetStatsFetcher() = 0; virtual IStartable* GetIamTokenClient() = 0; virtual IStartable* GetComputeClient() = 0; virtual IStartable* GetKmsClient() = 0; diff --git a/cloud/blockstore/libs/daemon/local/bootstrap.h 
b/cloud/blockstore/libs/daemon/local/bootstrap.h index 6c1c428a825..9a3e1ab4491 100644 --- a/cloud/blockstore/libs/daemon/local/bootstrap.h +++ b/cloud/blockstore/libs/daemon/local/bootstrap.h @@ -32,7 +32,7 @@ class TBootstrapLocal final IStartable* GetTraceSerializer() override { return nullptr; } IStartable* GetLogbrokerService() override { return nullptr; } IStartable* GetNotifyService() override { return nullptr; } - IStartable* GetCgroupStatsFetcher() override { return nullptr; } + IStartable* GetStatsFetcher() override { return nullptr; } IStartable* GetIamTokenClient() override { return nullptr; } IStartable* GetComputeClient() override { return nullptr; } IStartable* GetKmsClient() override { return nullptr; } diff --git a/cloud/blockstore/libs/daemon/ydb/bootstrap.cpp b/cloud/blockstore/libs/daemon/ydb/bootstrap.cpp index 941075b4573..5d1d7dabfc5 100644 --- a/cloud/blockstore/libs/daemon/ydb/bootstrap.cpp +++ b/cloud/blockstore/libs/daemon/ydb/bootstrap.cpp @@ -51,7 +51,7 @@ #include #include #include -#include +#include #include #include #include @@ -131,7 +131,7 @@ IStartable* TBootstrapYdb::GetYdbStorage() { return YdbStorage.get(); } IStartable* TBootstrapYdb::GetTraceSerializer() { return TraceSerializer.get(); } IStartable* TBootstrapYdb::GetLogbrokerService() { return LogbrokerService.get(); } IStartable* TBootstrapYdb::GetNotifyService() { return NotifyService.get(); } -IStartable* TBootstrapYdb::GetCgroupStatsFetcher() { return CgroupStatsFetcher.get(); } +IStartable* TBootstrapYdb::GetStatsFetcher() { return StatsFetcher.get(); } IStartable* TBootstrapYdb::GetIamTokenClient() { return IamTokenClient.get(); } IStartable* TBootstrapYdb::GetComputeClient() { return ComputeClient.get(); } IStartable* TBootstrapYdb::GetKmsClient() { return KmsClient.get(); } @@ -499,11 +499,11 @@ void TBootstrapYdb::InitKikimrService() STORAGE_INFO("ProfileLog initialized"); - CgroupStatsFetcher = BuildCgroupStatsFetcher( + StatsFetcher = 
NCloud::NStorage::BuildStatsFetcher( + Configs->DiagnosticsConfig->GetStatsFetcherType(), Configs->DiagnosticsConfig->GetCpuWaitFilename(), Log, - logging, - "BLOCKSTORE_CGROUPS"); + logging); if (Configs->StorageConfig->GetBlockDigestsEnabled()) { if (Configs->StorageConfig->GetUseTestBlockDigestGenerator()) { @@ -553,7 +553,7 @@ void TBootstrapYdb::InitKikimrService() args.LogbrokerService = LogbrokerService; args.NotifyService = NotifyService; args.VolumeStats = VolumeStats; - args.CgroupStatsFetcher = CgroupStatsFetcher; + args.StatsFetcher = StatsFetcher; args.RdmaServer = nullptr; args.RdmaClient = RdmaClient; args.Logging = logging; diff --git a/cloud/blockstore/libs/daemon/ydb/bootstrap.h b/cloud/blockstore/libs/daemon/ydb/bootstrap.h index b87ed13bbb6..74fa0c66efb 100644 --- a/cloud/blockstore/libs/daemon/ydb/bootstrap.h +++ b/cloud/blockstore/libs/daemon/ydb/bootstrap.h @@ -87,7 +87,7 @@ struct TBootstrapYdb final ITraceSerializerPtr TraceSerializer; NLogbroker::IServicePtr LogbrokerService; NNotify::IServicePtr NotifyService; - NCloud::NStorage::ICgroupStatsFetcherPtr CgroupStatsFetcher; + NCloud::NStorage::IStatsFetcherPtr StatsFetcher; NIamClient::IIamTokenClientPtr IamTokenClient; IComputeClientPtr ComputeClient; IKmsClientPtr KmsClient; @@ -115,7 +115,7 @@ struct TBootstrapYdb final IStartable* GetTraceSerializer() override; IStartable* GetLogbrokerService() override; IStartable* GetNotifyService() override; - IStartable* GetCgroupStatsFetcher() override; + IStartable* GetStatsFetcher() override; IStartable* GetIamTokenClient() override; IStartable* GetComputeClient() override; IStartable* GetKmsClient() override; diff --git a/cloud/blockstore/libs/diagnostics/config.cpp b/cloud/blockstore/libs/diagnostics/config.cpp index 27d7618ede6..70c0698e597 100644 --- a/cloud/blockstore/libs/diagnostics/config.cpp +++ b/cloud/blockstore/libs/diagnostics/config.cpp @@ -56,6 +56,7 @@ namespace { xxx(LocalHDDDowntimeThreshold, TDuration, TDuration::Seconds(15) )\ 
xxx(ReportHistogramAsMultipleCounters, bool, true )\ xxx(ReportHistogramAsSingleCounter, bool, false )\ + xxx(StatsFetcherType, NCloud::NProto::EStatsFetcherType, NCloud::NProto::EStatsFetcherType::CGROUP )\ // BLOCKSTORE_DIAGNOSTICS_CONFIG #define BLOCKSTORE_DIAGNOSTICS_DECLARE_CONFIG(name, type, value) \ @@ -287,3 +288,12 @@ void Out( { OutRequestThresholds(out, value); } + +template <> +void Out( + IOutputStream& out, + NCloud::NProto::EStatsFetcherType statsFetcherType) +{ + out << NCloud::NProto::EStatsFetcherType_Name( + statsFetcherType); +} diff --git a/cloud/blockstore/libs/diagnostics/config.h b/cloud/blockstore/libs/diagnostics/config.h index cefd0acaf70..47616ec0328 100644 --- a/cloud/blockstore/libs/diagnostics/config.h +++ b/cloud/blockstore/libs/diagnostics/config.h @@ -159,6 +159,8 @@ class TDiagnosticsConfig TRequestThresholds GetRequestThresholds() const; EHistogramCounterOptions GetHistogramCounterOptions() const; + NCloud::NProto::EStatsFetcherType GetStatsFetcherType() const; + void Dump(IOutputStream& out) const; void DumpHtml(IOutputStream& out) const; }; diff --git a/cloud/blockstore/libs/storage/init/server/actorsystem.cpp b/cloud/blockstore/libs/storage/init/server/actorsystem.cpp index ea5b107cb3b..92f636ade37 100644 --- a/cloud/blockstore/libs/storage/init/server/actorsystem.cpp +++ b/cloud/blockstore/libs/storage/init/server/actorsystem.cpp @@ -308,7 +308,7 @@ class TStorageServicesInitializer final auto volumeBalancerService = CreateVolumeBalancerActor( Args.StorageConfig, Args.VolumeStats, - Args.CgroupStatsFetcher, + Args.StatsFetcher, Args.VolumeBalancerSwitch, MakeStorageServiceId()); diff --git a/cloud/blockstore/libs/storage/init/server/actorsystem.h b/cloud/blockstore/libs/storage/init/server/actorsystem.h index 2c30e6edf1e..b8c8c8dff68 100644 --- a/cloud/blockstore/libs/storage/init/server/actorsystem.h +++ b/cloud/blockstore/libs/storage/init/server/actorsystem.h @@ -61,7 +61,7 @@ struct TServerActorSystemArgs IVolumeStatsPtr 
VolumeStats; NRdma::IServerPtr RdmaServer; NRdma::IClientPtr RdmaClient; - NCloud::NStorage::ICgroupStatsFetcherPtr CgroupStatsFetcher; + NCloud::NStorage::IStatsFetcherPtr StatsFetcher; TManuallyPreemptedVolumesPtr PreemptedVolumes; NNvme::INvmeManagerPtr NvmeManager; IVolumeBalancerSwitchPtr VolumeBalancerSwitch; diff --git a/cloud/blockstore/libs/storage/volume_balancer/volume_balancer.cpp b/cloud/blockstore/libs/storage/volume_balancer/volume_balancer.cpp index 68f688cdae6..d4a7cf2d845 100644 --- a/cloud/blockstore/libs/storage/volume_balancer/volume_balancer.cpp +++ b/cloud/blockstore/libs/storage/volume_balancer/volume_balancer.cpp @@ -11,14 +11,14 @@ using namespace NActors; IActorPtr CreateVolumeBalancerActor( TStorageConfigPtr storageConfig, IVolumeStatsPtr volumeStats, - NCloud::NStorage::ICgroupStatsFetcherPtr cgroupStatFetcher, + NCloud::NStorage::IStatsFetcherPtr statFetcher, IVolumeBalancerSwitchPtr volumeBalancerSwitch, NActors::TActorId serviceActorId) { return std::make_unique( std::move(storageConfig), std::move(volumeStats), - std::move(cgroupStatFetcher), + std::move(statFetcher), std::move(volumeBalancerSwitch), serviceActorId); } diff --git a/cloud/blockstore/libs/storage/volume_balancer/volume_balancer.h b/cloud/blockstore/libs/storage/volume_balancer/volume_balancer.h index 539448faf4a..289b4e76734 100644 --- a/cloud/blockstore/libs/storage/volume_balancer/volume_balancer.h +++ b/cloud/blockstore/libs/storage/volume_balancer/volume_balancer.h @@ -20,7 +20,7 @@ namespace NCloud::NBlockStore::NStorage { NActors::IActorPtr CreateVolumeBalancerActor( TStorageConfigPtr storageConfig, IVolumeStatsPtr volumeStats, - NCloud::NStorage::ICgroupStatsFetcherPtr cgroupStatFetcher, + NCloud::NStorage::IStatsFetcherPtr cgroupStatFetcher, IVolumeBalancerSwitchPtr volumeBalancerSwitch, NActors::TActorId serviceActorId); diff --git a/cloud/blockstore/libs/storage/volume_balancer/volume_balancer_actor.cpp 
b/cloud/blockstore/libs/storage/volume_balancer/volume_balancer_actor.cpp index fb118beadac..a94d01792c9 100644 --- a/cloud/blockstore/libs/storage/volume_balancer/volume_balancer_actor.cpp +++ b/cloud/blockstore/libs/storage/volume_balancer/volume_balancer_actor.cpp @@ -12,7 +12,7 @@ #include #include -#include +#include #include @@ -140,12 +140,12 @@ STFUNC(TRemoteVolumeStatActor::StateWork) TVolumeBalancerActor::TVolumeBalancerActor( TStorageConfigPtr storageConfig, IVolumeStatsPtr volumeStats, - NCloud::NStorage::ICgroupStatsFetcherPtr cgroupStatsFetcher, + NCloud::NStorage::IStatsFetcherPtr statsFetcher, IVolumeBalancerSwitchPtr volumeBalancerSwitch, TActorId serviceActorId) : StorageConfig(std::move(storageConfig)) , VolumeStats(std::move(volumeStats)) - , CgroupStatsFetcher(std::move(cgroupStatsFetcher)) + , StatsFetcher(std::move(statsFetcher)) , VolumeBalancerSwitch(std::move(volumeBalancerSwitch)) , ServiceActorId(serviceActorId) , State(std::make_unique(StorageConfig)) @@ -246,7 +246,7 @@ void TVolumeBalancerActor::HandleGetVolumeStatsResponse( auto now = ctx.Now(); auto interval = (now - LastCpuWaitQuery).MicroSeconds(); - auto [cpuWait, error] = CgroupStatsFetcher->GetCpuWait(); + auto [cpuWait, error] = StatsFetcher->GetCpuWait(); if (HasError(error)) { *CpuWaitFailure = 1; LOG_TRACE_S( diff --git a/cloud/blockstore/libs/storage/volume_balancer/volume_balancer_actor.h b/cloud/blockstore/libs/storage/volume_balancer/volume_balancer_actor.h index 6f965004ad3..b18de7ec099 100644 --- a/cloud/blockstore/libs/storage/volume_balancer/volume_balancer_actor.h +++ b/cloud/blockstore/libs/storage/volume_balancer/volume_balancer_actor.h @@ -28,7 +28,7 @@ class TVolumeBalancerActor final private: const TStorageConfigPtr StorageConfig; const IVolumeStatsPtr VolumeStats; - const NCloud::NStorage::ICgroupStatsFetcherPtr CgroupStatsFetcher; + const NCloud::NStorage::IStatsFetcherPtr StatsFetcher; const IVolumeBalancerSwitchPtr VolumeBalancerSwitch; const 
NActors::TActorId ServiceActorId; @@ -50,7 +50,7 @@ class TVolumeBalancerActor final TVolumeBalancerActor( TStorageConfigPtr storageConfig, IVolumeStatsPtr volumeStats, - NCloud::NStorage::ICgroupStatsFetcherPtr cgroupStatsFetcher, + NCloud::NStorage::IStatsFetcherPtr statsFetcher, IVolumeBalancerSwitchPtr volumeBalancerSwitch, NActors::TActorId serviceActorId); diff --git a/cloud/blockstore/libs/storage/volume_balancer/volume_balancer_ut.cpp b/cloud/blockstore/libs/storage/volume_balancer/volume_balancer_ut.cpp index a856c4dbd41..efac9add304 100644 --- a/cloud/blockstore/libs/storage/volume_balancer/volume_balancer_ut.cpp +++ b/cloud/blockstore/libs/storage/volume_balancer/volume_balancer_ut.cpp @@ -9,7 +9,7 @@ #include #include -#include +#include #include #include @@ -194,7 +194,7 @@ struct TVolumeStatsTestMock final //////////////////////////////////////////////////////////////////////////////// -struct TCgroupStatsFetcherMock: public NCloud::NStorage::ICgroupStatsFetcher +struct TStatsFetcherMock: public NCloud::NStorage::IStatsFetcher { TResultOrError Value = TDuration::Zero(); @@ -230,14 +230,14 @@ class TVolumeBalancerTestEnv public: std::shared_ptr VolumeStats; - std::shared_ptr Fetcher; + std::shared_ptr Fetcher; public: TVolumeBalancerTestEnv() { Sender = TestEnv.GetRuntime().AllocateEdgeActor(); VolumeStats = std::make_shared(); - Fetcher = std::make_shared(); + Fetcher = std::make_shared(); } TActorId GetEdgeActor() const @@ -411,7 +411,7 @@ NFeatures::TFeaturesConfigPtr CreateFeatureConfig( IActorPtr CreateVolumeBalancerActor( TVolumeBalancerConfigBuilder& config, IVolumeStatsPtr volumeStats, - NCloud::NStorage::ICgroupStatsFetcherPtr cgroupStatsFetcher, + NCloud::NStorage::IStatsFetcherPtr statsFetcher, TActorId serviceActorId) { NProto::TStorageServiceConfig storageConfig = config.Build(); @@ -425,7 +425,7 @@ IActorPtr CreateVolumeBalancerActor( CreateFeatureConfig("Balancer", {}) ), std::move(volumeStats), - std::move(cgroupStatsFetcher), + 
std::move(statsFetcher), std::move(volumeBalancerSwitch), std::move(serviceActorId)); } diff --git a/cloud/filestore/config/diagnostics.proto b/cloud/filestore/config/diagnostics.proto index 1e5d689b33a..a9c7f0c0e6e 100644 --- a/cloud/filestore/config/diagnostics.proto +++ b/cloud/filestore/config/diagnostics.proto @@ -4,6 +4,7 @@ package NCloud.NFileStore.NProto; option go_package = "github.com/ydb-platform/nbs/cloud/filestore/config"; +import "cloud/storage/core/protos/diagnostics.proto"; import "cloud/storage/core/protos/trace.proto"; //////////////////////////////////////////////////////////////////////////////// @@ -137,4 +138,7 @@ message TDiagnosticsConfig // Performance profile for SSD filesystems. optional TFileSystemPerformanceProfile SSDFileSystemPerformanceProfile = 27; + + // Type of fetching CPU stats + optional NCloud.NProto.EStatsFetcherType StatsFetcherType = 28; } diff --git a/cloud/filestore/libs/daemon/common/bootstrap.cpp b/cloud/filestore/libs/daemon/common/bootstrap.cpp index b11e5b0d6d0..725b06b5e27 100644 --- a/cloud/filestore/libs/daemon/common/bootstrap.cpp +++ b/cloud/filestore/libs/daemon/common/bootstrap.cpp @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include #include @@ -90,7 +90,7 @@ void TBootstrapCommon::Start() FILESTORE_LOG_START_COMPONENT(BackgroundThreadPool); FILESTORE_LOG_START_COMPONENT(ProfileLog); FILESTORE_LOG_START_COMPONENT(RequestStatsUpdater); - FILESTORE_LOG_START_COMPONENT(CgroupStatsFetcher); + FILESTORE_LOG_START_COMPONENT(StatsFetcher); StartComponents(); @@ -119,7 +119,7 @@ void TBootstrapCommon::Stop() StopComponents(); - FILESTORE_LOG_STOP_COMPONENT(CgroupStatsFetcher); + FILESTORE_LOG_STOP_COMPONENT(StatsFetcher); FILESTORE_LOG_STOP_COMPONENT(RequestStatsUpdater); FILESTORE_LOG_STOP_COMPONENT(ProfileLog); FILESTORE_LOG_STOP_COMPONENT(BackgroundThreadPool); @@ -271,16 +271,16 @@ void TBootstrapCommon::InitActorSystem() STORAGE_INFO("TraceSerializer initialized"); auto 
cpuWaitFilename = Configs->DiagnosticsConfig->GetCpuWaitFilename(); - CgroupStatsFetcher = BuildCgroupStatsFetcher( + StatsFetcher = NCloud::NStorage::BuildStatsFetcher( + Configs->DiagnosticsConfig->GetStatsFetcherType(), cpuWaitFilename.empty() ? NCloud::NStorage::BuildCpuWaitStatsFilename( Configs->DiagnosticsConfig->GetCpuWaitServiceName()) : std::move(cpuWaitFilename), Log, - logging, - "FILESTORE_CGROUPS"); + logging); - STORAGE_INFO("CgroupStatsFetcher initialized"); + STORAGE_INFO("StatsFetcher initialized"); NStorage::TActorSystemArgs args; args.NodeId = nodeId; @@ -292,7 +292,7 @@ void TBootstrapCommon::InitActorSystem() args.DiagnosticsConfig = Configs->DiagnosticsConfig; args.Metrics = Metrics; args.UserCounters = UserCounters; - args.CgroupStatsFetcher = CgroupStatsFetcher; + args.StatsFetcher = StatsFetcher; args.ModuleFactories = ModuleFactories; ActorSystem = NStorage::CreateActorSystem(args); diff --git a/cloud/filestore/libs/daemon/common/bootstrap.h b/cloud/filestore/libs/daemon/common/bootstrap.h index 634d2969b78..1172d53cd4e 100644 --- a/cloud/filestore/libs/daemon/common/bootstrap.h +++ b/cloud/filestore/libs/daemon/common/bootstrap.h @@ -73,7 +73,7 @@ class TBootstrapCommon ITaskQueuePtr BackgroundThreadPool; IProfileLogPtr ProfileLog; IActorSystemPtr ActorSystem; - NCloud::NStorage::ICgroupStatsFetcherPtr CgroupStatsFetcher; + NCloud::NStorage::IStatsFetcherPtr StatsFetcher; public: TBootstrapCommon( diff --git a/cloud/filestore/libs/diagnostics/config.cpp b/cloud/filestore/libs/diagnostics/config.cpp index 391da103407..8b8d23bc456 100644 --- a/cloud/filestore/libs/diagnostics/config.cpp +++ b/cloud/filestore/libs/diagnostics/config.cpp @@ -41,6 +41,8 @@ namespace { \ xxx(HDDFileSystemPerformanceProfile, TFileSystemPerformanceProfile, {} )\ xxx(SSDFileSystemPerformanceProfile, TFileSystemPerformanceProfile, {} )\ + \ + xxx(StatsFetcherType, NCloud::NProto::EStatsFetcherType, NCloud::NProto::EStatsFetcherType::CGROUP )\ // 
FILESTORE_DIAGNOSTICS_CONFIG #define FILESTORE_DIAGNOSTICS_DECLARE_CONFIG(name, type, value) \ @@ -227,3 +229,12 @@ void Out( SerializeToTextFormat(v, out); } + +template <> +void Out( + IOutputStream& out, + NCloud::NProto::EStatsFetcherType statsFetcherType) +{ + out << NCloud::NProto::EStatsFetcherType_Name( + statsFetcherType); +} diff --git a/cloud/filestore/libs/diagnostics/config.h b/cloud/filestore/libs/diagnostics/config.h index fe473ac4d30..b0320cb67c1 100644 --- a/cloud/filestore/libs/diagnostics/config.h +++ b/cloud/filestore/libs/diagnostics/config.h @@ -137,6 +137,8 @@ class TDiagnosticsConfig TFileSystemPerformanceProfile GetHDDFileSystemPerformanceProfile() const; TFileSystemPerformanceProfile GetSSDFileSystemPerformanceProfile() const; + NCloud::NProto::EStatsFetcherType GetStatsFetcherType() const; + void Dump(IOutputStream& out) const; void DumpHtml(IOutputStream& out) const; }; diff --git a/cloud/filestore/libs/storage/init/actorsystem.cpp b/cloud/filestore/libs/storage/init/actorsystem.cpp index 3b315c645d8..4c476b90e00 100644 --- a/cloud/filestore/libs/storage/init/actorsystem.cpp +++ b/cloud/filestore/libs/storage/init/actorsystem.cpp @@ -85,7 +85,7 @@ class TStorageServicesInitializer final StatsRegistry, Args.ProfileLog, Args.TraceSerializer, - Args.CgroupStatsFetcher); + Args.StatsFetcher); setup->LocalServices.emplace_back( MakeStorageServiceId(), diff --git a/cloud/filestore/libs/storage/init/actorsystem.h b/cloud/filestore/libs/storage/init/actorsystem.h index 9f7dd198996..455f0a04916 100644 --- a/cloud/filestore/libs/storage/init/actorsystem.h +++ b/cloud/filestore/libs/storage/init/actorsystem.h @@ -34,7 +34,7 @@ struct TActorSystemArgs std::shared_ptr UserCounters; - NCloud::NStorage::ICgroupStatsFetcherPtr CgroupStatsFetcher; + NCloud::NStorage::IStatsFetcherPtr StatsFetcher; }; //////////////////////////////////////////////////////////////////////////////// diff --git a/cloud/filestore/libs/storage/service/service.cpp 
b/cloud/filestore/libs/storage/service/service.cpp index d7d0321816a..5dbff3bc5d4 100644 --- a/cloud/filestore/libs/storage/service/service.cpp +++ b/cloud/filestore/libs/storage/service/service.cpp @@ -13,14 +13,14 @@ IActorPtr CreateStorageService( IRequestStatsRegistryPtr statsRegistry, IProfileLogPtr profileLog, ITraceSerializerPtr traceSerialzer, - NCloud::NStorage::ICgroupStatsFetcherPtr cgroupStatsFetcher) + NCloud::NStorage::IStatsFetcherPtr statsFetcher) { return std::make_unique( std::move(storageConfig), std::move(statsRegistry), std::move(profileLog), std::move(traceSerialzer), - std::move(cgroupStatsFetcher)); + std::move(statsFetcher)); } } // namespace NCloud::NFileStore::NStorage diff --git a/cloud/filestore/libs/storage/service/service.h b/cloud/filestore/libs/storage/service/service.h index 7222307d268..1bb16f2f986 100644 --- a/cloud/filestore/libs/storage/service/service.h +++ b/cloud/filestore/libs/storage/service/service.h @@ -16,6 +16,6 @@ NActors::IActorPtr CreateStorageService( IRequestStatsRegistryPtr statsRegistry, IProfileLogPtr profileLog, ITraceSerializerPtr traceSerialzer, - NCloud::NStorage::ICgroupStatsFetcherPtr cgroupStatsFetcher); + NCloud::NStorage::IStatsFetcherPtr xtatsFetcher); } // namespace NCloud::NFileStore::NStorage diff --git a/cloud/filestore/libs/storage/service/service_actor.cpp b/cloud/filestore/libs/storage/service/service_actor.cpp index c0bb6d292fb..967c04131e7 100644 --- a/cloud/filestore/libs/storage/service/service_actor.cpp +++ b/cloud/filestore/libs/storage/service/service_actor.cpp @@ -20,11 +20,11 @@ TStorageServiceActor::TStorageServiceActor( IRequestStatsRegistryPtr statsRegistry, IProfileLogPtr profileLog, ITraceSerializerPtr traceSerializer, - NCloud::NStorage::ICgroupStatsFetcherPtr cgroupStatsFetcher) + NCloud::NStorage::IStatsFetcherPtr statsFetcher) : StorageConfig{std::move(storageConfig)} , ProfileLog{std::move(profileLog)} , TraceSerializer{std::move(traceSerializer)} - , 
CgroupStatsFetcher(std::move(cgroupStatsFetcher)) + , StatsFetcher(std::move(statsFetcher)) , State{std::make_unique()} , StatsRegistry{std::move(statsRegistry)} {} diff --git a/cloud/filestore/libs/storage/service/service_actor.h b/cloud/filestore/libs/storage/service/service_actor.h index 921f3010f95..8dc4002b3d7 100644 --- a/cloud/filestore/libs/storage/service/service_actor.h +++ b/cloud/filestore/libs/storage/service/service_actor.h @@ -36,7 +36,7 @@ class TStorageServiceActor final const TStorageConfigPtr StorageConfig; const IProfileLogPtr ProfileLog; const ITraceSerializerPtr TraceSerializer; - const NCloud::NStorage::ICgroupStatsFetcherPtr CgroupStatsFetcher; + const NCloud::NStorage::IStatsFetcherPtr StatsFetcher; std::unique_ptr State; ui64 ProxyCounter = 0; @@ -64,7 +64,7 @@ class TStorageServiceActor final IRequestStatsRegistryPtr statsRegistry, IProfileLogPtr profileLog, ITraceSerializerPtr traceSerializer, - NCloud::NStorage::ICgroupStatsFetcherPtr cgroupStatsFetcher); + NCloud::NStorage::IStatsFetcherPtr statsFetcher); ~TStorageServiceActor(); void Bootstrap(const NActors::TActorContext& ctx); diff --git a/cloud/filestore/libs/storage/service/service_actor_update_stats.cpp b/cloud/filestore/libs/storage/service/service_actor_update_stats.cpp index a6167b75774..0a2ff316334 100644 --- a/cloud/filestore/libs/storage/service/service_actor_update_stats.cpp +++ b/cloud/filestore/libs/storage/service/service_actor_update_stats.cpp @@ -2,7 +2,7 @@ #include -#include +#include namespace NCloud::NFileStore::NStorage { @@ -65,11 +65,12 @@ void TStorageServiceActor::HandleUpdateStats( InFlightRequests.erase(it++); } } - if (CgroupStatsFetcher) { + + if (StatsFetcher) { auto now = ctx.Now(); auto interval = (now - LastCpuWaitQuery).MicroSeconds(); - if (auto [cpuWait, error] = CgroupStatsFetcher->GetCpuWait(); + if (auto [cpuWait, error] = StatsFetcher->GetCpuWait(); !HasError(error)) { *CpuWaitFailure = 0; diff --git 
a/cloud/filestore/libs/storage/testlib/test_env.cpp b/cloud/filestore/libs/storage/testlib/test_env.cpp index 8ff9737af06..7a72c74629e 100644 --- a/cloud/filestore/libs/storage/testlib/test_env.cpp +++ b/cloud/filestore/libs/storage/testlib/test_env.cpp @@ -17,7 +17,7 @@ #include #include -#include +#include #include #include #include @@ -226,7 +226,7 @@ ui32 TTestEnv::CreateNode(const TString& name) StatsRegistry, ProfileLog, TraceSerializer, - CreateCgroupStatsFetcherStub()); + CreateStatsFetcherStub()); auto indexServiceId = Runtime.Register( indexService.release(), nodeIdx, diff --git a/cloud/storage/core/libs/diagnostics/cgroup_stats_fetcher.cpp b/cloud/storage/core/libs/diagnostics/cgroup_stats_fetcher.cpp index e8aebada475..618f6445e8d 100644 --- a/cloud/storage/core/libs/diagnostics/cgroup_stats_fetcher.cpp +++ b/cloud/storage/core/libs/diagnostics/cgroup_stats_fetcher.cpp @@ -1,13 +1,11 @@ -#include "cgroup_stats_fetcher.h" +#include "stats_fetcher.h" +#include #include #include -#include #include #include -#include -#include #include namespace NCloud::NStorage { @@ -17,7 +15,7 @@ namespace { //////////////////////////////////////////////////////////////////////////////// struct TCgroupStatsFetcher final - : public ICgroupStatsFetcher + : public IStatsFetcher { private: const TString ComponentName; @@ -122,30 +120,11 @@ struct TCgroupStatsFetcher final } }; -//////////////////////////////////////////////////////////////////////////////// - -struct TCgroupStatsFetcherStub final - : public ICgroupStatsFetcher -{ - void Start() override - { - } - - void Stop() override - { - } - - TResultOrError GetCpuWait() override - { - return TDuration::Zero(); - } -}; - } // namespace //////////////////////////////////////////////////////////////////////////////// -ICgroupStatsFetcherPtr CreateCgroupStatsFetcher( +IStatsFetcherPtr CreateCgroupStatsFetcher( TString componentName, ILoggingServicePtr logging, TString statsFile) @@ -156,39 +135,4 @@ ICgroupStatsFetcherPtr 
CreateCgroupStatsFetcher( std::move(statsFile)); } -ICgroupStatsFetcherPtr CreateCgroupStatsFetcherStub() -{ - return std::make_shared(); -} - -TString BuildCpuWaitStatsFilename(const TString& serviceName) -{ - static constexpr auto CpuWaitStatsFilenameTemplate = - "/sys/fs/cgroup/cpu/system.slice/%s.service/cpuacct.wait"; - if (!serviceName.empty()) { - return Sprintf(CpuWaitStatsFilenameTemplate, serviceName.c_str()); - } - return {}; -} - -NCloud::NStorage::ICgroupStatsFetcherPtr BuildCgroupStatsFetcher( - TString cpuWaitFilename, - const TLog& log, - ILoggingServicePtr logging, - TString componentName) -{ - if (cpuWaitFilename.empty()) { - const auto& Log = log; - STORAGE_INFO( - "CpuWaitServiceName and CpuWaitFilename are empty, can't build " - "CgroupStatsFetcher"); - return CreateCgroupStatsFetcherStub(); - } - - return CreateCgroupStatsFetcher( - std::move(componentName), - std::move(logging), - std::move(cpuWaitFilename)); -}; - } // namespace NCloud::NStorage diff --git a/cloud/storage/core/libs/diagnostics/cgroup_stats_fetcher_ut.cpp b/cloud/storage/core/libs/diagnostics/cgroup_stats_fetcher_ut.cpp index 1c886361c41..3cfb512ef78 100644 --- a/cloud/storage/core/libs/diagnostics/cgroup_stats_fetcher_ut.cpp +++ b/cloud/storage/core/libs/diagnostics/cgroup_stats_fetcher_ut.cpp @@ -1,4 +1,4 @@ -#include "cgroup_stats_fetcher.h" +#include "stats_fetcher.h" #include "critical_events.h" diff --git a/cloud/storage/core/libs/diagnostics/public.h b/cloud/storage/core/libs/diagnostics/public.h index 09a483d193f..91b0af14d31 100644 --- a/cloud/storage/core/libs/diagnostics/public.h +++ b/cloud/storage/core/libs/diagnostics/public.h @@ -84,8 +84,8 @@ namespace NStorage { //////////////////////////////////////////////////////////////////////////////// -struct ICgroupStatsFetcher; -using ICgroupStatsFetcherPtr = std::shared_ptr; +struct IStatsFetcher; +using IStatsFetcherPtr = std::shared_ptr; } // namespace NStorage diff --git 
a/cloud/storage/core/libs/diagnostics/qemu_ut/bin/ya.make b/cloud/storage/core/libs/diagnostics/qemu_ut/bin/ya.make new file mode 100644 index 00000000000..37e16a42a35 --- /dev/null +++ b/cloud/storage/core/libs/diagnostics/qemu_ut/bin/ya.make @@ -0,0 +1,9 @@ +UNITTEST_FOR(cloud/storage/core/libs/diagnostics) + +IF (OS_LINUX) + SRCS( + task_stats_fetcher_ut.cpp + ) +ENDIF() + +END() diff --git a/cloud/storage/core/libs/diagnostics/qemu_ut/test.py b/cloud/storage/core/libs/diagnostics/qemu_ut/test.py new file mode 100644 index 00000000000..8ef1266040a --- /dev/null +++ b/cloud/storage/core/libs/diagnostics/qemu_ut/test.py @@ -0,0 +1,9 @@ +import yatest.common as common + +tests_bin = "cloud-storage-core-libs-diagnostics-qemu_ut-bin" +tests_bin_path = "cloud/storage/core/libs/diagnostics/qemu_ut/bin/" + tests_bin + + +def test_qemu_ut(): + test_tool = common.binary_path(tests_bin_path) + common.execute(test_tool) diff --git a/cloud/storage/core/libs/diagnostics/qemu_ut/ya.make b/cloud/storage/core/libs/diagnostics/qemu_ut/ya.make new file mode 100644 index 00000000000..8ea9261a722 --- /dev/null +++ b/cloud/storage/core/libs/diagnostics/qemu_ut/ya.make @@ -0,0 +1,16 @@ +PY3TEST() + +INCLUDE(${ARCADIA_ROOT}/cloud/storage/core/tests/recipes/medium.inc) +SPLIT_FACTOR(1) + +DEPENDS( + cloud/storage/core/libs/diagnostics/qemu_ut/bin +) + +TEST_SRCS( + test.py +) + +INCLUDE(${ARCADIA_ROOT}/cloud/storage/core/tests/recipes/qemu.inc) + +END() diff --git a/cloud/storage/core/libs/diagnostics/stats_fetcher.cpp b/cloud/storage/core/libs/diagnostics/stats_fetcher.cpp new file mode 100644 index 00000000000..6770576f501 --- /dev/null +++ b/cloud/storage/core/libs/diagnostics/stats_fetcher.cpp @@ -0,0 +1,79 @@ +#include "stats_fetcher.h" + +#include +#include + +#include + +namespace NCloud::NStorage { + +namespace { + +//////////////////////////////////////////////////////////////////////////////// + +struct TStatsFetcherStub final + : public IStatsFetcher +{ + void Start() 
override + { + } + + void Stop() override + { + } + + TResultOrError GetCpuWait() override + { + return TDuration::Zero(); + } +}; + +} // namespace + +//////////////////////////////////////////////////////////////////////////////// + +IStatsFetcherPtr CreateStatsFetcherStub() +{ + return std::make_shared(); +} + +TString BuildCpuWaitStatsFilename(const TString& serviceName) +{ + static constexpr auto CpuWaitStatsFilenameTemplate = + "/sys/fs/cgroup/cpu/system.slice/%s.service/cpuacct.wait"; + if (!serviceName.empty()) { + return Sprintf(CpuWaitStatsFilenameTemplate, serviceName.c_str()); + } + return {}; +} + +IStatsFetcherPtr BuildStatsFetcher( + NProto::EStatsFetcherType statsFetcherType, + const TString& cpuWaitFilename, + const TLog& log, + ILoggingServicePtr logging) +{ + switch (statsFetcherType) { + case NCloud::NProto::CGROUP: { + if (cpuWaitFilename.empty()) { + const auto& Log = log; + STORAGE_INFO( + "CpuWaitFilename is empty, can't build " + "CgroupStatsFetcher"); + return CreateStatsFetcherStub(); + } + + return CreateCgroupStatsFetcher( + "STORAGE_STATS", + std::move(logging), + std::move(cpuWaitFilename)); + } + case NCloud::NProto::TASKSTATS: + return CreateTaskStatsFetcher( + "STORAGE_STATS", + std::move(logging), + getpid()); + } +} + +} // namespace NCloud::NStorage diff --git a/cloud/storage/core/libs/diagnostics/cgroup_stats_fetcher.h b/cloud/storage/core/libs/diagnostics/stats_fetcher.h similarity index 60% rename from cloud/storage/core/libs/diagnostics/cgroup_stats_fetcher.h rename to cloud/storage/core/libs/diagnostics/stats_fetcher.h index 19051ce8621..9f5b7ef7ac3 100644 --- a/cloud/storage/core/libs/diagnostics/cgroup_stats_fetcher.h +++ b/cloud/storage/core/libs/diagnostics/stats_fetcher.h @@ -4,6 +4,7 @@ #include #include +#include #include @@ -17,31 +18,36 @@ namespace NCloud::NStorage { //////////////////////////////////////////////////////////////////////////////// -struct ICgroupStatsFetcher +struct IStatsFetcher : public 
IStartable { - virtual ~ICgroupStatsFetcher() = default; + virtual ~IStatsFetcher() = default; virtual TResultOrError GetCpuWait() = 0; }; -using ICgroupStatsFetcherPtr = std::shared_ptr; +using IStatsFetcherPtr = std::shared_ptr; //////////////////////////////////////////////////////////////////////////////// -ICgroupStatsFetcherPtr CreateCgroupStatsFetcher( +IStatsFetcherPtr CreateCgroupStatsFetcher( TString componentName, ILoggingServicePtr logging, TString statsFile); -ICgroupStatsFetcherPtr CreateCgroupStatsFetcherStub(); +IStatsFetcherPtr CreateTaskStatsFetcher( + TString componentName, + ILoggingServicePtr logging, + int pid); + +IStatsFetcherPtr CreateStatsFetcherStub(); TString BuildCpuWaitStatsFilename(const TString& serviceName); -ICgroupStatsFetcherPtr BuildCgroupStatsFetcher( - TString cpuWaitFilename, +IStatsFetcherPtr BuildStatsFetcher( + NProto::EStatsFetcherType statsFetcherType, + const TString& cpuWaitFilename, const TLog& log, - ILoggingServicePtr logging, - TString componentName); + ILoggingServicePtr logging); } // namespace NCloud::NStorage diff --git a/cloud/storage/core/libs/diagnostics/task_stats_fetcher.cpp b/cloud/storage/core/libs/diagnostics/task_stats_fetcher.cpp new file mode 100644 index 00000000000..b929cb6bea1 --- /dev/null +++ b/cloud/storage/core/libs/diagnostics/task_stats_fetcher.cpp @@ -0,0 +1,252 @@ +#include "stats_fetcher.h" + +#include +#include + +#include +#include +#include + +#include +#include + +namespace NCloud::NStorage { + +namespace { + +//////////////////////////////////////////////////////////////////////////////// + +void ValidateAttribute(const ::nlattr& attribute, ui16 expectedAttribute) +{ + if (attribute.nla_type != expectedAttribute) { + throw yexception() << "Invalid attribute type: " << attribute.nla_type + << " Expected attribute type: " << expectedAttribute; + } +} + +//////////////////////////////////////////////////////////////////////////////// + +// Documentation: +// 
https://github.com/torvalds/linux/blob/master/Documentation/accounting/taskstats.rst + +#pragma pack(push, NLMSG_ALIGNTO) + +struct TTaskStatsFamilyIdRequest +{ + ::nlmsghdr MessageHeader = + {sizeof(TTaskStatsFamilyIdRequest), GENL_ID_CTRL, NLM_F_REQUEST, 0, 0}; + ::genlmsghdr GenericHeader = {CTRL_CMD_GETFAMILY, 1, 0}; + ::nlattr FamilyNameAttr = { + sizeof(FamilyName) + NLA_HDRLEN, + CTRL_ATTR_FAMILY_NAME}; + const char FamilyName[sizeof(TASKSTATS_GENL_NAME)] = TASKSTATS_GENL_NAME; +}; + +struct TTaskStatsFamilyIdResponse +{ + ::nlmsghdr MessageHeader; + ::genlmsghdr GenericHeader; + ::nlattr FamilyNameAttr; + char FamilyName[sizeof(TASKSTATS_GENL_NAME)]; + alignas(NLMSG_ALIGNTO)::nlattr FamilyIdAttr; + ui16 FamilyId; + + void Validate() + { + ValidateAttribute(FamilyNameAttr, CTRL_ATTR_FAMILY_NAME); + ValidateAttribute(FamilyIdAttr, CTRL_ATTR_FAMILY_ID); + } +}; + +struct TTaskStatsRequest +{ + ::nlmsghdr MessageHeader; + ::genlmsghdr GenericHeader; + ::nlattr PidAttr; + ui32 Pid; + + TTaskStatsRequest(ui16 familyId, ui32 pid) + : MessageHeader{sizeof(TTaskStatsRequest), familyId, NLM_F_REQUEST, 0, 0} + , GenericHeader{TASKSTATS_CMD_GET, 1, 0} + , PidAttr{sizeof(Pid) + NLA_HDRLEN, TASKSTATS_CMD_ATTR_PID} + , Pid(pid) + {} +}; + +struct TTaskStatsResponse +{ + ::nlmsghdr MessageHeader; + ::genlmsghdr GenericHeader; + ::nlattr AggrPidAttr; + ::nlattr PidAttr; + ui32 Pid; + ::nlattr TaskStatsAttr; + ::taskstats TaskStats; + + void Validate() + { + ValidateAttribute(AggrPidAttr, TASKSTATS_TYPE_AGGR_PID); + ValidateAttribute(PidAttr, TASKSTATS_TYPE_PID); + ValidateAttribute(TaskStatsAttr, TASKSTATS_TYPE_STATS); + } +}; + +#pragma pack(pop) + +//////////////////////////////////////////////////////////////////////////////// + +template +union TNetlinkResponse { + T Msg; + ui8 Buffer[MaxMsgSize]; + + TNetlinkResponse() { + static_assert(sizeof(T) < MaxMsgSize); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +class 
TNetlinkSocket +{ +private: + TSocket Socket; + ui32 SocketTimeoutMs = 100; + +public: + TNetlinkSocket(ui32 socketTimeoutMs = 100) + : Socket(::socket(PF_NETLINK, SOCK_RAW, NETLINK_GENERIC)) + , SocketTimeoutMs(socketTimeoutMs) + { + if (Socket < 0) { + throw yexception() << "Failed to create netlink socket"; + } + Socket.SetSocketTimeout(0, SocketTimeoutMs); + } + + template + void Send(const TNetlinkMessage& msg) + { + auto ret = Socket.Send(&msg, sizeof(msg)); + if (ret == -1) { + throw yexception() + << "Failed to send netlink message: " << strerror(errno); + } + } + + template + void Receive(TNetlinkResponse& response) + { + auto ret = Socket.Recv(&response, sizeof(response)); + if (ret < 0) { + throw yexception() + << "Failed to receive netlink message: " << strerror(errno); + } + + if (response.Msg.MessageHeader.nlmsg_type == NLMSG_ERROR) { + throw yexception() + << "Failed to receive netlink message: kernel returned error"; + } + + if (!NLMSG_OK(&response.Msg.MessageHeader, ret)) { + throw yexception() + << "Failed to parse netlink message: incorrect format"; + } + return; + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +struct TTaskStatsFetcher final: public IStatsFetcher +{ +private: + const TString ComponentName; + const ILoggingServicePtr Logging; + int Pid; + TLog Log; + const TDuration NetlinkSocketTimeout = TDuration::Seconds(1); + TDuration Last; + ui16 FamilyId; + + ui16 GetFamilyId() + { + TNetlinkSocket socket; + socket.Send(TTaskStatsFamilyIdRequest()); + TNetlinkResponse response; + socket.Receive(response); + response.Msg.Validate(); + return response.Msg.FamilyId; + } + +public: + TTaskStatsFetcher( + TString componentName, + ILoggingServicePtr logging, + int pid) + : ComponentName(std::move(componentName)) + , Logging(std::move(logging)) + , Pid(pid) + , FamilyId(0) + { + } + + ~TTaskStatsFetcher() override + { + Stop(); + } + + void Start() override + { + Log = 
Logging->CreateLog(ComponentName); + } + + void Stop() override + { + } + + TResultOrError GetCpuWait() override + { + try { + if (FamilyId == 0) { + FamilyId = GetFamilyId(); + } + + TNetlinkSocket socket; + socket.Send(TTaskStatsRequest(FamilyId, Pid)); + TNetlinkResponse response; + socket.Receive(response); + response.Msg.Validate(); + auto cpuLack = TDuration::MilliSeconds( + response.Msg.TaskStats.cpu_delay_total / 1000); + auto retval = cpuLack - Last; + Last = cpuLack; + return retval; + } catch (...) { + auto errorMessage = BuildErrorMessageFromException(); + return MakeError(E_FAIL, errorMessage); + } + } + + TString BuildErrorMessageFromException() + { + auto msg = TStringBuilder() << "IO error"; + msg << " with exception " << CurrentExceptionMessage(); + return msg; + } +}; + +} // namespace + +IStatsFetcherPtr CreateTaskStatsFetcher( + TString componentName, + ILoggingServicePtr logging, + int pid) +{ + return std::make_shared( + std::move(componentName), + std::move(logging), + pid); +} + +} // namespace NCloud::NStorage diff --git a/cloud/storage/core/libs/diagnostics/task_stats_fetcher_ut.cpp b/cloud/storage/core/libs/diagnostics/task_stats_fetcher_ut.cpp new file mode 100644 index 00000000000..eb329ac2102 --- /dev/null +++ b/cloud/storage/core/libs/diagnostics/task_stats_fetcher_ut.cpp @@ -0,0 +1,39 @@ +#include "stats_fetcher.h" + +#include "critical_events.h" + +#include +#include + +#include +#include + +#include + +namespace NCloud::NStorage { + +namespace { + +//////////////////////////////////////////////////////////////////////////////// + +const TString ComponentName = "STORAGE_STATS"; + +} // namespace + +//////////////////////////////////////////////////////////////////////////////// + +Y_UNIT_TEST_SUITE(TaskStatsFetcherTest) +{ + Y_UNIT_TEST(ShouldGetCpuWait) + { + auto fetcher = CreateTaskStatsFetcher( + ComponentName, + CreateLoggingService("console"), + getpid()); + fetcher->Start(); + auto [cpuWait, error] = fetcher->GetCpuWait(); + 
UNIT_ASSERT_C(!HasError(error), error); + } +} + +} // namespace NCloud::NStorage diff --git a/cloud/storage/core/libs/diagnostics/ya.make b/cloud/storage/core/libs/diagnostics/ya.make index 78876ea5f3a..57be7f8c1ac 100644 --- a/cloud/storage/core/libs/diagnostics/ya.make +++ b/cloud/storage/core/libs/diagnostics/ya.make @@ -2,8 +2,8 @@ LIBRARY() SRCS( busy_idle_calculator.cpp - cgroup_stats_fetcher.cpp counters_helper.cpp + cgroup_stats_fetcher.cpp critical_events.cpp executor_counters.cpp histogram_types.cpp @@ -15,7 +15,9 @@ SRCS( postpone_time_predictor.cpp request_counters.cpp solomon_counters.cpp + stats_fetcher.cpp stats_updater.cpp + task_stats_fetcher.cpp trace_processor_mon.cpp trace_processor.cpp trace_reader.cpp @@ -46,4 +48,5 @@ PEERDIR( END() +RECURSE_FOR_TESTS(qemu_ut) RECURSE_FOR_TESTS(ut) diff --git a/cloud/storage/core/protos/diagnostics.proto b/cloud/storage/core/protos/diagnostics.proto new file mode 100644 index 00000000000..79cfffcc6b5 --- /dev/null +++ b/cloud/storage/core/protos/diagnostics.proto @@ -0,0 +1,15 @@ +syntax = "proto2"; + +package NCloud.NProto; + +option go_package = "github.com/ydb-platform/nbs/cloud/storage/core/protos"; + +//////////////////////////////////////////////////////////////////////////////// +// CPU stats monitoring type + +enum EStatsFetcherType +{ + CGROUP = 0; + TASKSTATS = 1; +}; + diff --git a/cloud/storage/core/protos/ya.make b/cloud/storage/core/protos/ya.make index 1546617f38e..84f654a146d 100644 --- a/cloud/storage/core/protos/ya.make +++ b/cloud/storage/core/protos/ya.make @@ -11,6 +11,7 @@ SRCS( authorization_mode.proto certificate.proto config_dispatcher_settings.proto + diagnostics.proto endpoints.proto error.proto media.proto diff --git a/cloud/storage/core/tools/analytics/cpu-wait-monitor/main.cpp b/cloud/storage/core/tools/analytics/cpu-wait-monitor/main.cpp index e8663e38414..e63a6448e36 100644 --- a/cloud/storage/core/tools/analytics/cpu-wait-monitor/main.cpp +++ 
b/cloud/storage/core/tools/analytics/cpu-wait-monitor/main.cpp @@ -1,4 +1,4 @@ -#include +#include #include #include