diff --git a/cloud/blockstore/libs/storage/init/disk_agent/actorsystem.cpp b/cloud/blockstore/libs/storage/init/disk_agent/actorsystem.cpp index 92016b6ec2e..4609cd63b34 100644 --- a/cloud/blockstore/libs/storage/init/disk_agent/actorsystem.cpp +++ b/cloud/blockstore/libs/storage/init/disk_agent/actorsystem.cpp @@ -52,15 +52,20 @@ class TStorageServicesInitializer final // HiveProxy // - auto hiveProxy = CreateHiveProxy({ - .PipeClientRetryCount = Args.StorageConfig->GetPipeClientRetryCount(), - .PipeClientMinRetryTime = Args.StorageConfig->GetPipeClientMinRetryTime(), - .HiveLockExpireTimeout = Args.StorageConfig->GetHiveLockExpireTimeout(), - .LogComponent = TBlockStoreComponents::HIVE_PROXY, - .TabletBootInfoBackupFilePath = {}, - .FallbackMode = false, - .TenantHiveTabletId = Args.StorageConfig->GetTenantHiveTabletId(), - }); + auto hiveProxy = CreateHiveProxy( + { + .PipeClientRetryCount = Args.StorageConfig->GetPipeClientRetryCount(), + .PipeClientMinRetryTime = Args.StorageConfig->GetPipeClientMinRetryTime(), + .HiveLockExpireTimeout = Args.StorageConfig->GetHiveLockExpireTimeout(), + .LogComponent = TBlockStoreComponents::HIVE_PROXY, + .TabletBootInfoBackupFilePath = {}, + .FallbackMode = false, + .TenantHiveTabletId = Args.StorageConfig->GetTenantHiveTabletId(), + }, + appData + ->Counters + ->GetSubgroup("counters", "blockstore") + ->GetSubgroup("component", "service")); setup->LocalServices.emplace_back( MakeHiveProxyServiceId(), diff --git a/cloud/blockstore/libs/storage/init/server/actorsystem.cpp b/cloud/blockstore/libs/storage/init/server/actorsystem.cpp index 92f636ade37..feb47d5e1f2 100644 --- a/cloud/blockstore/libs/storage/init/server/actorsystem.cpp +++ b/cloud/blockstore/libs/storage/init/server/actorsystem.cpp @@ -138,15 +138,20 @@ class TStorageServicesInitializer final // HiveProxy // - auto hiveProxy = CreateHiveProxy({ - .PipeClientRetryCount = Args.StorageConfig->GetPipeClientRetryCount(), - .PipeClientMinRetryTime = Args.StorageConfig->GetPipeClientMinRetryTime(), - .HiveLockExpireTimeout = Args.StorageConfig->GetHiveLockExpireTimeout(), - .LogComponent = TBlockStoreComponents::HIVE_PROXY, - .TabletBootInfoBackupFilePath = Args.StorageConfig->GetTabletBootInfoBackupFilePath(), - .FallbackMode = Args.StorageConfig->GetHiveProxyFallbackMode(), - .TenantHiveTabletId = Args.StorageConfig->GetTenantHiveTabletId(), - }); + auto hiveProxy = CreateHiveProxy( + { + .PipeClientRetryCount = Args.StorageConfig->GetPipeClientRetryCount(), + .PipeClientMinRetryTime = Args.StorageConfig->GetPipeClientMinRetryTime(), + .HiveLockExpireTimeout = Args.StorageConfig->GetHiveLockExpireTimeout(), + .LogComponent = TBlockStoreComponents::HIVE_PROXY, + .TabletBootInfoBackupFilePath = Args.StorageConfig->GetTabletBootInfoBackupFilePath(), + .FallbackMode = Args.StorageConfig->GetHiveProxyFallbackMode(), + .TenantHiveTabletId = Args.StorageConfig->GetTenantHiveTabletId(), + }, + appData + ->Counters + ->GetSubgroup("counters", "blockstore") + ->GetSubgroup("component", "service")); setup->LocalServices.emplace_back( MakeHiveProxyServiceId(), diff --git a/cloud/filestore/libs/storage/init/actorsystem.cpp b/cloud/filestore/libs/storage/init/actorsystem.cpp index 4c476b90e00..50d01323964 100644 --- a/cloud/filestore/libs/storage/init/actorsystem.cpp +++ b/cloud/filestore/libs/storage/init/actorsystem.cpp @@ -124,16 +124,21 @@ class TStorageServicesInitializer final // HiveProxy // - auto hiveProxy = CreateHiveProxy({ - .PipeClientRetryCount = Args.StorageConfig->GetPipeClientRetryCount(), - .PipeClientMinRetryTime = Args.StorageConfig->GetPipeClientMinRetryTime(), - // HiveLockExpireTimeout, used by NBS, doesn't matter - .HiveLockExpireTimeout = TDuration::Seconds(1), - .LogComponent = TFileStoreComponents::HIVE_PROXY, - .TabletBootInfoBackupFilePath = Args.StorageConfig->GetTabletBootInfoBackupFilePath(), - .FallbackMode = Args.StorageConfig->GetHiveProxyFallbackMode(), - .TenantHiveTabletId = Args.StorageConfig->GetTenantHiveTabletId(), - }); + auto hiveProxy = CreateHiveProxy( + { + .PipeClientRetryCount = Args.StorageConfig->GetPipeClientRetryCount(), + .PipeClientMinRetryTime = Args.StorageConfig->GetPipeClientMinRetryTime(), + // HiveLockExpireTimeout, used by NBS, doesn't matter + .HiveLockExpireTimeout = TDuration::Seconds(1), + .LogComponent = TFileStoreComponents::HIVE_PROXY, + .TabletBootInfoBackupFilePath = Args.StorageConfig->GetTabletBootInfoBackupFilePath(), + .FallbackMode = Args.StorageConfig->GetHiveProxyFallbackMode(), + .TenantHiveTabletId = Args.StorageConfig->GetTenantHiveTabletId(), + }, + appData + ->Counters + ->GetSubgroup("counters", "filestore") + ->GetSubgroup("component", "service")); setup->LocalServices.emplace_back( MakeHiveProxyServiceId(), diff --git a/cloud/storage/core/libs/hive_proxy/hive_proxy.cpp b/cloud/storage/core/libs/hive_proxy/hive_proxy.cpp index 17eb0ba3a07..9719bff23db 100644 --- a/cloud/storage/core/libs/hive_proxy/hive_proxy.cpp +++ b/cloud/storage/core/libs/hive_proxy/hive_proxy.cpp @@ -10,12 +10,21 @@ using namespace NActors; //////////////////////////////////////////////////////////////////////////////// IActorPtr CreateHiveProxy(THiveProxyConfig config) +{ + return CreateHiveProxy(std::move(config), {}); +} + +IActorPtr CreateHiveProxy( + THiveProxyConfig config, + NMonitoring::TDynamicCounterPtr counters) { if (config.FallbackMode) { return std::make_unique(std::move(config)); } - return std::make_unique(std::move(config)); + return std::make_unique( + std::move(config), + std::move(counters)); } } // namespace NCloud::NStorage diff --git a/cloud/storage/core/libs/hive_proxy/hive_proxy.h b/cloud/storage/core/libs/hive_proxy/hive_proxy.h index ee9398bf122..187fed6c310 100644 --- a/cloud/storage/core/libs/hive_proxy/hive_proxy.h +++ b/cloud/storage/core/libs/hive_proxy/hive_proxy.h @@ -5,10 +5,15 @@ #include #include +#include + namespace NCloud::NStorage { //////////////////////////////////////////////////////////////////////////////// NActors::IActorPtr CreateHiveProxy(THiveProxyConfig config); +NActors::IActorPtr CreateHiveProxy( + THiveProxyConfig config, + NMonitoring::TDynamicCounterPtr counters); } // namespace NCloud::NStorage diff --git a/cloud/storage/core/libs/hive_proxy/hive_proxy_actor.cpp b/cloud/storage/core/libs/hive_proxy/hive_proxy_actor.cpp index ed8198233a5..e9e5c0616f6 100644 --- a/cloud/storage/core/libs/hive_proxy/hive_proxy_actor.cpp +++ b/cloud/storage/core/libs/hive_proxy/hive_proxy_actor.cpp @@ -34,12 +34,19 @@ std::unique_ptr CreateTabletPipeClientCache( //////////////////////////////////////////////////////////////////////////////// -THiveProxyActor::THiveProxyActor(THiveProxyConfig config) +THiveProxyActor::THiveProxyActor( + THiveProxyConfig config, + NMonitoring::TDynamicCounterPtr counters) : ClientCache(CreateTabletPipeClientCache(config)) , LockExpireTimeout(config.HiveLockExpireTimeout) , LogComponent(config.LogComponent) , TabletBootInfoBackupFilePath(config.TabletBootInfoBackupFilePath) , TenantHiveTabletId(config.TenantHiveTabletId) + , Counters(std::move(counters)) +{} + +THiveProxyActor::THiveProxyActor(THiveProxyConfig config) + : THiveProxyActor(std::move(config), {}) {} void THiveProxyActor::Bootstrap(const TActorContext& ctx) @@ -55,10 +62,24 @@ void THiveProxyActor::Bootstrap(const TActorContext& ctx) TabletBootInfoBackup = ctx.Register( cache.release(), TMailboxType::HTSwap, AppData()->IOPoolId); } + if (Counters) { + HiveReconnectTimeCounter = Counters->GetCounter("HiveReconnectTime", true); + } } //////////////////////////////////////////////////////////////////////////////// +void THiveProxyActor::SendRequest( + const TActorContext& ctx, + ui64 hive, + IEventBase* request) +{ + ClientCache->Send(ctx, hive, request); + if (HiveDisconnected) { + HiveReconnectStartCycles = GetCycleCount(); + } +} + ui64 THiveProxyActor::GetHive( const TActorContext& ctx, ui64 tabletId, @@ -98,7 +119,7 @@ void THiveProxyActor::SendLockRequest( hiveRequest->Record.SetMaxReconnectTimeout( LockExpireTimeout.MilliSeconds()); hiveRequest->Record.SetReconnect(reconnect); - ClientCache->Send(ctx, hive, hiveRequest.release()); + SendRequest(ctx, hive, hiveRequest.release()); } void THiveProxyActor::SendUnlockRequest( @@ -106,7 +127,7 @@ void THiveProxyActor::SendUnlockRequest( { auto hiveRequest = std::make_unique(tabletId); - ClientCache->Send(ctx, hive, hiveRequest.release()); + SendRequest(ctx, hive, hiveRequest.release()); } void THiveProxyActor::SendGetTabletStorageInfoRequest( @@ -115,7 +136,7 @@ void THiveProxyActor::SendGetTabletStorageInfoRequest( { auto hiveRequest = std::make_unique(tabletId); - ClientCache->Send(ctx, hive, hiveRequest.release()); + SendRequest(ctx, hive, hiveRequest.release()); } void THiveProxyActor::SendLockReply( @@ -198,7 +219,7 @@ void THiveProxyActor::SendTabletMetrics( prTabletId.second.OnStatsSend(); } if (record.TabletMetricsSize() > 0) { - ClientCache->Send(ctx, hive, event.Release()); + SendRequest(ctx, hive, event.Release()); } } @@ -216,6 +237,14 @@ void THiveProxyActor::HandleConnect( auto error = MakeKikimrError(msg->Status, TStringBuilder() << "Connect to hive " << hive << " failed"); HandleConnectionError(ctx, error, hive, true); + } else if (HiveReconnectStartCycles) { + if (HiveReconnectTimeCounter) { + HiveReconnectTimeCounter->Add( + CyclesToDuration( + GetCycleCount() - HiveReconnectStartCycles).MicroSeconds()); + } + HiveReconnectStartCycles = 0; + HiveDisconnected = false; } } @@ -242,6 +271,8 @@ void THiveProxyActor::HandleConnectionError( Y_UNUSED(error); Y_UNUSED(connectFailed); + HiveDisconnected = true; + LOG_ERROR_S(ctx, LogComponent, "Pipe to hive" << hive << " has been reset "); @@ -307,6 +338,9 @@ void THiveProxyActor::HandleConnectionError( for (const auto& actorId: states->Actors) { auto clientId = ClientCache->Prepare(ctx, hive); + if (!HiveReconnectStartCycles) { + HiveReconnectStartCycles = GetCycleCount(); + } NCloud::Send( ctx, actorId, diff --git a/cloud/storage/core/libs/hive_proxy/hive_proxy_actor.h b/cloud/storage/core/libs/hive_proxy/hive_proxy_actor.h index 75920d897e1..04b54f7efe2 100644 --- a/cloud/storage/core/libs/hive_proxy/hive_proxy_actor.h +++ b/cloud/storage/core/libs/hive_proxy/hive_proxy_actor.h @@ -177,14 +177,28 @@ class THiveProxyActor final const ui64 TenantHiveTabletId; + const NMonitoring::TDynamicCounterPtr Counters; + NMonitoring::TDynamicCounters::TCounterPtr HiveReconnectTimeCounter; + ui64 HiveReconnectStartCycles = 0; + bool HiveDisconnected = true; + public: explicit THiveProxyActor(THiveProxyConfig config); + THiveProxyActor( + THiveProxyConfig config, + NMonitoring::TDynamicCounterPtr counters); + void Bootstrap(const NActors::TActorContext& ctx); private: STFUNC(StateWork); + void SendRequest( + const NActors::TActorContext& ctx, + ui64 hive, + NActors::IEventBase* request); + ui64 GetHive( const NActors::TActorContext& ctx, ui64 tabletId, diff --git a/cloud/storage/core/libs/hive_proxy/hive_proxy_ut.cpp b/cloud/storage/core/libs/hive_proxy/hive_proxy_ut.cpp index 7d51f50452b..20c292e4950 100644 --- a/cloud/storage/core/libs/hive_proxy/hive_proxy_ut.cpp +++ b/cloud/storage/core/libs/hive_proxy/hive_proxy_ut.cpp @@ -513,7 +513,9 @@ struct TTestEnv .TenantHiveTabletId = tenantHive, }; HiveProxyActorId = Runtime.Register( - CreateHiveProxy(std::move(config)).release()); + CreateHiveProxy( + std::move(config), + Runtime.GetAppData(0).Counters).release()); Runtime.EnableScheduleForActor(HiveProxyActorId); Runtime.RegisterService(MakeHiveProxyServiceId(), HiveProxyActorId); } @@ -1541,6 +1543,45 @@ Y_UNIT_TEST_SUITE(THiveProxyTest) UNIT_ASSERT_VALUES_EQUAL(1, hiveMessages); UNIT_ASSERT_VALUES_EQUAL(1, wakeups); } + + Y_UNIT_TEST(ShouldReportHiveReconnectTime) + { + TTestBasicRuntime runtime; + TTestEnv env(runtime); + + auto sender = runtime.AllocateEdgeActor(); + + auto counter = env.Runtime.GetAppData(0).Counters + ->GetCounter("HiveReconnectTime", true); + + env.SendLockRequest(sender, FakeTablet2); + UNIT_ASSERT_VALUES_UNEQUAL(0, counter->Val()); + + auto oldVal = counter->Val(); + + int hiveLockRequests = 0; + runtime.SetObserverFunc([&](TAutoPtr& event) { + Y_UNUSED(runtime); + if (event->GetTypeRewrite() == TEvHive::EvLockTabletExecution) { + ++hiveLockRequests; + } + return TTestActorRuntime::EEventAction::PROCESS; + }); + + env.EnableTabletResolverScheduling(); + env.RebootHive(); + + while (!hiveLockRequests) { + // Pipe to hive may take a long time to connect + // Wait until hive receives the lock request + runtime.DispatchEvents(TDispatchOptions(), TDuration::Seconds(1)); + } + + runtime.SetObserverFunc(&TTestActorRuntime::DefaultObserverFunc); + + // Rebooting hive should reconnect the lock + UNIT_ASSERT_GT(counter->Val(), oldVal); + } } } // namespace NCloud::NStorage diff --git a/cloud/storage/core/libs/hive_proxy/ya.make b/cloud/storage/core/libs/hive_proxy/ya.make index 15f253fb11c..794d095164e 100644 --- a/cloud/storage/core/libs/hive_proxy/ya.make +++ b/cloud/storage/core/libs/hive_proxy/ya.make @@ -27,6 +27,8 @@ PEERDIR( contrib/ydb/core/tablet_flat contrib/ydb/library/actors/core + + library/cpp/monlib/dynamic_counters ) END()