From d075141e294c3973e1dda6610cb536c63b3b93e8 Mon Sep 17 00:00:00 2001 From: Egor Masharskii Date: Mon, 3 Feb 2025 20:49:58 +0000 Subject: [PATCH 1/4] issue-2969: add hive reconnect time counter --- .../storage/init/disk_agent/actorsystem.cpp | 23 +++++---- .../libs/storage/init/server/actorsystem.cpp | 23 +++++---- .../libs/storage/init/actorsystem.cpp | 25 ++++++---- .../core/libs/hive_proxy/hive_proxy.cpp | 13 +++++ .../storage/core/libs/hive_proxy/hive_proxy.h | 5 ++ .../core/libs/hive_proxy/hive_proxy_actor.cpp | 49 +++++++++++++++++-- .../core/libs/hive_proxy/hive_proxy_actor.h | 14 ++++++ .../core/libs/hive_proxy/hive_proxy_ut.cpp | 42 +++++++++++++++- cloud/storage/core/libs/hive_proxy/ya.make | 2 + 9 files changed, 162 insertions(+), 34 deletions(-) diff --git a/cloud/blockstore/libs/storage/init/disk_agent/actorsystem.cpp b/cloud/blockstore/libs/storage/init/disk_agent/actorsystem.cpp index 92016b6ec2e..4609cd63b34 100644 --- a/cloud/blockstore/libs/storage/init/disk_agent/actorsystem.cpp +++ b/cloud/blockstore/libs/storage/init/disk_agent/actorsystem.cpp @@ -52,15 +52,20 @@ class TStorageServicesInitializer final // HiveProxy // - auto hiveProxy = CreateHiveProxy({ - .PipeClientRetryCount = Args.StorageConfig->GetPipeClientRetryCount(), - .PipeClientMinRetryTime = Args.StorageConfig->GetPipeClientMinRetryTime(), - .HiveLockExpireTimeout = Args.StorageConfig->GetHiveLockExpireTimeout(), - .LogComponent = TBlockStoreComponents::HIVE_PROXY, - .TabletBootInfoBackupFilePath = {}, - .FallbackMode = false, - .TenantHiveTabletId = Args.StorageConfig->GetTenantHiveTabletId(), - }); + auto hiveProxy = CreateHiveProxy( + { + .PipeClientRetryCount = Args.StorageConfig->GetPipeClientRetryCount(), + .PipeClientMinRetryTime = Args.StorageConfig->GetPipeClientMinRetryTime(), + .HiveLockExpireTimeout = Args.StorageConfig->GetHiveLockExpireTimeout(), + .LogComponent = TBlockStoreComponents::HIVE_PROXY, + .TabletBootInfoBackupFilePath = {}, + .FallbackMode = false, + .TenantHiveTabletId = Args.StorageConfig->GetTenantHiveTabletId(), + }, + appData + ->Counters + ->GetSubgroup("counters", "blockstore") + ->GetSubgroup("component", "service")); setup->LocalServices.emplace_back( MakeHiveProxyServiceId(), diff --git a/cloud/blockstore/libs/storage/init/server/actorsystem.cpp b/cloud/blockstore/libs/storage/init/server/actorsystem.cpp index ea5b107cb3b..0fa97acf011 100644 --- a/cloud/blockstore/libs/storage/init/server/actorsystem.cpp +++ b/cloud/blockstore/libs/storage/init/server/actorsystem.cpp @@ -138,15 +138,20 @@ class TStorageServicesInitializer final // HiveProxy // - auto hiveProxy = CreateHiveProxy({ - .PipeClientRetryCount = Args.StorageConfig->GetPipeClientRetryCount(), - .PipeClientMinRetryTime = Args.StorageConfig->GetPipeClientMinRetryTime(), - .HiveLockExpireTimeout = Args.StorageConfig->GetHiveLockExpireTimeout(), - .LogComponent = TBlockStoreComponents::HIVE_PROXY, - .TabletBootInfoBackupFilePath = Args.StorageConfig->GetTabletBootInfoBackupFilePath(), - .FallbackMode = Args.StorageConfig->GetHiveProxyFallbackMode(), - .TenantHiveTabletId = Args.StorageConfig->GetTenantHiveTabletId(), - }); + auto hiveProxy = CreateHiveProxy( + { + .PipeClientRetryCount = Args.StorageConfig->GetPipeClientRetryCount(), + .PipeClientMinRetryTime = Args.StorageConfig->GetPipeClientMinRetryTime(), + .HiveLockExpireTimeout = Args.StorageConfig->GetHiveLockExpireTimeout(), + .LogComponent = TBlockStoreComponents::HIVE_PROXY, + .TabletBootInfoBackupFilePath = Args.StorageConfig->GetTabletBootInfoBackupFilePath(), + .FallbackMode = Args.StorageConfig->GetHiveProxyFallbackMode(), + .TenantHiveTabletId = Args.StorageConfig->GetTenantHiveTabletId(), + }, + appData + ->Counters + ->GetSubgroup("counters", "blockstore") + ->GetSubgroup("component", "service")); setup->LocalServices.emplace_back( MakeHiveProxyServiceId(), diff --git a/cloud/filestore/libs/storage/init/actorsystem.cpp b/cloud/filestore/libs/storage/init/actorsystem.cpp index 3b315c645d8..b40063cbdf5 100644 --- a/cloud/filestore/libs/storage/init/actorsystem.cpp +++ b/cloud/filestore/libs/storage/init/actorsystem.cpp @@ -124,16 +124,21 @@ class TStorageServicesInitializer final // HiveProxy // - auto hiveProxy = CreateHiveProxy({ - .PipeClientRetryCount = Args.StorageConfig->GetPipeClientRetryCount(), - .PipeClientMinRetryTime = Args.StorageConfig->GetPipeClientMinRetryTime(), - // HiveLockExpireTimeout, used by NBS, doesn't matter - .HiveLockExpireTimeout = TDuration::Seconds(1), - .LogComponent = TFileStoreComponents::HIVE_PROXY, - .TabletBootInfoBackupFilePath = Args.StorageConfig->GetTabletBootInfoBackupFilePath(), - .FallbackMode = Args.StorageConfig->GetHiveProxyFallbackMode(), - .TenantHiveTabletId = Args.StorageConfig->GetTenantHiveTabletId(), - }); + auto hiveProxy = CreateHiveProxy( + { + .PipeClientRetryCount = Args.StorageConfig->GetPipeClientRetryCount(), + .PipeClientMinRetryTime = Args.StorageConfig->GetPipeClientMinRetryTime(), + // HiveLockExpireTimeout, used by NBS, doesn't matter + .HiveLockExpireTimeout = TDuration::Seconds(1), + .LogComponent = TFileStoreComponents::HIVE_PROXY, + .TabletBootInfoBackupFilePath = Args.StorageConfig->GetTabletBootInfoBackupFilePath(), + .FallbackMode = Args.StorageConfig->GetHiveProxyFallbackMode(), + .TenantHiveTabletId = Args.StorageConfig->GetTenantHiveTabletId(), + }, + appData + ->Counters + ->GetSubgroup("counters", "filestore") + ->GetSubgroup("component", "service")); setup->LocalServices.emplace_back( MakeHiveProxyServiceId(), diff --git a/cloud/storage/core/libs/hive_proxy/hive_proxy.cpp b/cloud/storage/core/libs/hive_proxy/hive_proxy.cpp index 17eb0ba3a07..2f03716b145 100644 --- a/cloud/storage/core/libs/hive_proxy/hive_proxy.cpp +++ b/cloud/storage/core/libs/hive_proxy/hive_proxy.cpp @@ -18,4 +18,17 @@ IActorPtr CreateHiveProxy(THiveProxyConfig config) return std::make_unique(std::move(config)); } +IActorPtr CreateHiveProxy( + THiveProxyConfig config, + NMonitoring::TDynamicCounterPtr CountersRoot) +{ + if (config.FallbackMode) { + return std::make_unique(std::move(config)); + } + + return std::make_unique( + std::move(config), + std::move(CountersRoot)); +} + } // namespace NCloud::NStorage diff --git a/cloud/storage/core/libs/hive_proxy/hive_proxy.h b/cloud/storage/core/libs/hive_proxy/hive_proxy.h index ee9398bf122..7f231bfa645 100644 --- a/cloud/storage/core/libs/hive_proxy/hive_proxy.h +++ b/cloud/storage/core/libs/hive_proxy/hive_proxy.h @@ -5,10 +5,15 @@ #include #include +#include + namespace NCloud::NStorage { //////////////////////////////////////////////////////////////////////////////// NActors::IActorPtr CreateHiveProxy(THiveProxyConfig config); +NActors::IActorPtr CreateHiveProxy( + THiveProxyConfig config, + NMonitoring::TDynamicCounterPtr CountersRoot); } // namespace NCloud::NStorage diff --git a/cloud/storage/core/libs/hive_proxy/hive_proxy_actor.cpp b/cloud/storage/core/libs/hive_proxy/hive_proxy_actor.cpp index ed8198233a5..00623f5eab2 100644 --- a/cloud/storage/core/libs/hive_proxy/hive_proxy_actor.cpp +++ b/cloud/storage/core/libs/hive_proxy/hive_proxy_actor.cpp @@ -34,12 +34,19 @@ std::unique_ptr CreateTabletPipeClientCache( //////////////////////////////////////////////////////////////////////////////// -THiveProxyActor::THiveProxyActor(THiveProxyConfig config) +THiveProxyActor::THiveProxyActor( + THiveProxyConfig config, + NMonitoring::TDynamicCounterPtr countersRoot) : ClientCache(CreateTabletPipeClientCache(config)) , LockExpireTimeout(config.HiveLockExpireTimeout) , LogComponent(config.LogComponent) , TabletBootInfoBackupFilePath(config.TabletBootInfoBackupFilePath) , TenantHiveTabletId(config.TenantHiveTabletId) + , CountersRoot(std::move(countersRoot)) +{} + +THiveProxyActor::THiveProxyActor(THiveProxyConfig config) + : THiveProxyActor(std::move(config), {}) {} void THiveProxyActor::Bootstrap(const TActorContext& ctx) @@ -55,10 +62,26 @@ void THiveProxyActor::Bootstrap(const TActorContext& ctx) TabletBootInfoBackup = ctx.Register( cache.release(), TMailboxType::HTSwap, AppData()->IOPoolId); } + if (CountersRoot) { + ReconnectPipeCounter = CountersRoot->GetCounter("HiveReconnectTime", true); + } } //////////////////////////////////////////////////////////////////////////////// +void THiveProxyActor::SendRequest( + const TActorContext& ctx, + ui64 hive, + IEventBase* request) +{ + if (auto clientId = ClientCache->Send(ctx, hive, request); + clientId != HiveClientId) + { + HiveClientId = clientId; + ReconnectStartTime = GetCycleCount(); + } +} + ui64 THiveProxyActor::GetHive( const TActorContext& ctx, ui64 tabletId, @@ -98,7 +121,7 @@ void THiveProxyActor::SendLockRequest( hiveRequest->Record.SetMaxReconnectTimeout( LockExpireTimeout.MilliSeconds()); hiveRequest->Record.SetReconnect(reconnect); - ClientCache->Send(ctx, hive, hiveRequest.release()); + SendRequest(ctx, hive, hiveRequest.release()); } void THiveProxyActor::SendUnlockRequest( @@ -106,7 +129,7 @@ void THiveProxyActor::SendUnlockRequest( { auto hiveRequest = std::make_unique(tabletId); - ClientCache->Send(ctx, hive, hiveRequest.release()); + SendRequest(ctx, hive, hiveRequest.release()); } void THiveProxyActor::SendGetTabletStorageInfoRequest( @@ -115,7 +138,7 @@ void THiveProxyActor::SendGetTabletStorageInfoRequest( { auto hiveRequest = std::make_unique(tabletId); - ClientCache->Send(ctx, hive, hiveRequest.release()); + SendRequest(ctx, hive, hiveRequest.release()); } void THiveProxyActor::SendLockReply( @@ -198,7 +221,7 @@ void THiveProxyActor::SendTabletMetrics( prTabletId.second.OnStatsSend(); } if (record.TabletMetricsSize() > 0) { - ClientCache->Send(ctx, hive, event.Release()); + SendRequest(ctx, hive, event.Release()); } } @@ -216,6 +239,13 @@ void THiveProxyActor::HandleConnect( auto error = MakeKikimrError(msg->Status, TStringBuilder() << "Connect to hive " << hive << " failed"); HandleConnectionError(ctx, error, hive, true); + } else if (msg->ClientId == HiveClientId && ReconnectStartTime) + { + if (ReconnectPipeCounter) { + ReconnectPipeCounter->Add( + CyclesToDuration(GetCycleCount() - ReconnectStartTime).MicroSeconds()); + } + ReconnectStartTime = 0; } } @@ -227,6 +257,7 @@ void THiveProxyActor::HandleDisconnect( ui64 hive = msg->TabletId; ClientCache->OnDisconnect(ev); + HiveClientId = {}; auto error = MakeError(E_REJECTED, TStringBuilder() << "Disconnected from hive " << hive); @@ -242,6 +273,8 @@ void THiveProxyActor::HandleConnectionError( Y_UNUSED(error); Y_UNUSED(connectFailed); + HiveClientId = {}; + LOG_ERROR_S(ctx, LogComponent, "Pipe to hive" << hive << " has been reset "); @@ -307,6 +340,12 @@ void THiveProxyActor::HandleConnectionError( for (const auto& actorId: states->Actors) { auto clientId = ClientCache->Prepare(ctx, hive); + if (!HiveClientId) { + HiveClientId = clientId; + if (!ReconnectStartTime) { + ReconnectStartTime = GetCycleCount(); + } + } NCloud::Send( ctx, actorId, diff --git a/cloud/storage/core/libs/hive_proxy/hive_proxy_actor.h b/cloud/storage/core/libs/hive_proxy/hive_proxy_actor.h index 75920d897e1..0350cbb08ff 100644 --- a/cloud/storage/core/libs/hive_proxy/hive_proxy_actor.h +++ b/cloud/storage/core/libs/hive_proxy/hive_proxy_actor.h @@ -177,14 +177,28 @@ class THiveProxyActor final const ui64 TenantHiveTabletId; + const NMonitoring::TDynamicCounterPtr CountersRoot; + NMonitoring::TDynamicCounters::TCounterPtr ReconnectPipeCounter; + ui64 ReconnectStartTime = 0; + NActors::TActorId HiveClientId; + public: explicit THiveProxyActor(THiveProxyConfig config); + THiveProxyActor( + THiveProxyConfig config, + NMonitoring::TDynamicCounterPtr countersRoot); + void Bootstrap(const NActors::TActorContext& ctx); private: STFUNC(StateWork); + void SendRequest( + const NActors::TActorContext& ctx, + ui64 hive, + NActors::IEventBase* request); + ui64 GetHive( const NActors::TActorContext& ctx, ui64 tabletId, diff --git a/cloud/storage/core/libs/hive_proxy/hive_proxy_ut.cpp b/cloud/storage/core/libs/hive_proxy/hive_proxy_ut.cpp index 7d51f50452b..f40cf617abd 100644 --- a/cloud/storage/core/libs/hive_proxy/hive_proxy_ut.cpp +++ b/cloud/storage/core/libs/hive_proxy/hive_proxy_ut.cpp @@ -513,7 +513,8 @@ struct TTestEnv .TenantHiveTabletId = tenantHive, }; HiveProxyActorId = Runtime.Register( - CreateHiveProxy(std::move(config)).release()); + CreateHiveProxy( + std::move(config), Runtime.GetAppData(0).Counters).release()); Runtime.EnableScheduleForActor(HiveProxyActorId); Runtime.RegisterService(MakeHiveProxyServiceId(), HiveProxyActorId); } @@ -1541,6 +1542,45 @@ Y_UNIT_TEST_SUITE(THiveProxyTest) UNIT_ASSERT_VALUES_EQUAL(1, hiveMessages); UNIT_ASSERT_VALUES_EQUAL(1, wakeups); } + + Y_UNIT_TEST(ShouldReportHiveReconnectTime) + { + TTestBasicRuntime runtime; + TTestEnv env(runtime); + + auto sender = runtime.AllocateEdgeActor(); + + auto counter = env.Runtime.GetAppData(0).Counters + ->GetCounter("HiveReconnectTime", true); + + env.SendLockRequest(sender, FakeTablet2); + UNIT_ASSERT_VALUES_UNEQUAL(0, counter->Val()); + + auto oldVal = counter->Val(); + + int hiveLockRequests = 0; + runtime.SetObserverFunc([&](TAutoPtr& event) { + Y_UNUSED(runtime); + if (event->GetTypeRewrite() == TEvHive::EvLockTabletExecution) { + ++hiveLockRequests; + } + return TTestActorRuntime::EEventAction::PROCESS; + }); + + env.EnableTabletResolverScheduling(); + env.RebootHive(); + + for (int retries = 0; retries < 5 && !hiveLockRequests; ++retries) { + // Pipe to hive may take a long time to connect + // Wait until hive receives the lock request + runtime.DispatchEvents(TDispatchOptions(), TDuration::Seconds(1)); + } + + runtime.SetObserverFunc(&TTestActorRuntime::DefaultObserverFunc); + + // Rebooting hive should reconnect the lock + UNIT_ASSERT_GT(counter->Val(), oldVal); + } } } // namespace NCloud::NStorage diff --git a/cloud/storage/core/libs/hive_proxy/ya.make b/cloud/storage/core/libs/hive_proxy/ya.make index 15f253fb11c..794d095164e 100644 --- a/cloud/storage/core/libs/hive_proxy/ya.make +++ b/cloud/storage/core/libs/hive_proxy/ya.make @@ -27,6 +27,8 @@ PEERDIR( contrib/ydb/core/tablet_flat contrib/ydb/library/actors/core + + library/cpp/monlib/dynamic_counters ) END() From 8baac1cf762d9b32c5bad4b7c5a1198dc75256da Mon Sep 17 00:00:00 2001 From: Egor Masharskii Date: Tue, 4 Feb 2025 11:48:23 +0000 Subject: [PATCH 2/4] update --- .../core/libs/hive_proxy/hive_proxy.cpp | 10 +++----- .../storage/core/libs/hive_proxy/hive_proxy.h | 2 +- .../core/libs/hive_proxy/hive_proxy_actor.cpp | 24 +++++++++---------- .../core/libs/hive_proxy/hive_proxy_actor.h | 8 +++---- 4 files changed, 20 insertions(+), 24 deletions(-) diff --git a/cloud/storage/core/libs/hive_proxy/hive_proxy.cpp b/cloud/storage/core/libs/hive_proxy/hive_proxy.cpp index 2f03716b145..9719bff23db 100644 --- a/cloud/storage/core/libs/hive_proxy/hive_proxy.cpp +++ b/cloud/storage/core/libs/hive_proxy/hive_proxy.cpp @@ -11,16 +11,12 @@ using namespace NActors; IActorPtr CreateHiveProxy(THiveProxyConfig config) { - if (config.FallbackMode) { - return std::make_unique(std::move(config)); - } - - return std::make_unique(std::move(config)); + return CreateHiveProxy(std::move(config), {}); } IActorPtr CreateHiveProxy( THiveProxyConfig config, - NMonitoring::TDynamicCounterPtr CountersRoot) + NMonitoring::TDynamicCounterPtr counters) { if (config.FallbackMode) { return std::make_unique(std::move(config)); @@ -28,7 +24,7 @@ IActorPtr CreateHiveProxy( return std::make_unique( std::move(config), - std::move(CountersRoot)); + std::move(counters)); } } // namespace NCloud::NStorage diff --git a/cloud/storage/core/libs/hive_proxy/hive_proxy.h b/cloud/storage/core/libs/hive_proxy/hive_proxy.h index 7f231bfa645..187fed6c310 100644 --- a/cloud/storage/core/libs/hive_proxy/hive_proxy.h +++ b/cloud/storage/core/libs/hive_proxy/hive_proxy.h @@ -14,6 +14,6 @@ namespace NCloud::NStorage { NActors::IActorPtr CreateHiveProxy(THiveProxyConfig config); NActors::IActorPtr CreateHiveProxy( THiveProxyConfig config, - NMonitoring::TDynamicCounterPtr CountersRoot); + NMonitoring::TDynamicCounterPtr counters); } // namespace NCloud::NStorage diff --git a/cloud/storage/core/libs/hive_proxy/hive_proxy_actor.cpp b/cloud/storage/core/libs/hive_proxy/hive_proxy_actor.cpp index 00623f5eab2..3deba7c81ce 100644 --- a/cloud/storage/core/libs/hive_proxy/hive_proxy_actor.cpp +++ b/cloud/storage/core/libs/hive_proxy/hive_proxy_actor.cpp @@ -36,13 +36,13 @@ std::unique_ptr CreateTabletPipeClientCache( THiveProxyActor::THiveProxyActor( THiveProxyConfig config, - NMonitoring::TDynamicCounterPtr countersRoot) + NMonitoring::TDynamicCounterPtr counters) : ClientCache(CreateTabletPipeClientCache(config)) , LockExpireTimeout(config.HiveLockExpireTimeout) , LogComponent(config.LogComponent) , TabletBootInfoBackupFilePath(config.TabletBootInfoBackupFilePath) , TenantHiveTabletId(config.TenantHiveTabletId) - , CountersRoot(std::move(countersRoot)) + , Counters(std::move(counters)) {} THiveProxyActor::THiveProxyActor(THiveProxyConfig config) @@ -62,8 +62,8 @@ void THiveProxyActor::Bootstrap(const TActorContext& ctx) TabletBootInfoBackup = ctx.Register( cache.release(), TMailboxType::HTSwap, AppData()->IOPoolId); } - if (CountersRoot) { - ReconnectPipeCounter = CountersRoot->GetCounter("HiveReconnectTime", true); + if (Counters) { + HiveReconnectTimeCounter = Counters->GetCounter("HiveReconnectTime", true); } } @@ -78,7 +78,7 @@ void THiveProxyActor::SendRequest( clientId != HiveClientId) { HiveClientId = clientId; - ReconnectStartTime = GetCycleCount(); + HiveReconnectStartTime = GetCycleCount(); } } @@ -239,13 +239,13 @@ void THiveProxyActor::HandleConnect( auto error = MakeKikimrError(msg->Status, TStringBuilder() << "Connect to hive " << hive << " failed"); HandleConnectionError(ctx, error, hive, true); - } else if (msg->ClientId == HiveClientId && ReconnectStartTime) + } else if (msg->ClientId == HiveClientId && HiveReconnectStartTime) { - if (ReconnectPipeCounter) { - ReconnectPipeCounter->Add( - CyclesToDuration(GetCycleCount() - ReconnectStartTime).MicroSeconds()); + if (HiveReconnectTimeCounter) { + HiveReconnectTimeCounter->Add( + CyclesToDuration(GetCycleCount() - HiveReconnectStartTime).MicroSeconds()); } - ReconnectStartTime = 0; + HiveReconnectStartTime = 0; } } @@ -342,8 +342,8 @@ void THiveProxyActor::HandleConnectionError( auto clientId = ClientCache->Prepare(ctx, hive); if (!HiveClientId) { HiveClientId = clientId; - if (!ReconnectStartTime) { - ReconnectStartTime = GetCycleCount(); + if (!HiveReconnectStartTime) { + HiveReconnectStartTime = GetCycleCount(); } } NCloud::Send( diff --git a/cloud/storage/core/libs/hive_proxy/hive_proxy_actor.h b/cloud/storage/core/libs/hive_proxy/hive_proxy_actor.h index 0350cbb08ff..308dab94890 100644 --- a/cloud/storage/core/libs/hive_proxy/hive_proxy_actor.h +++ b/cloud/storage/core/libs/hive_proxy/hive_proxy_actor.h @@ -177,9 +177,9 @@ class THiveProxyActor final const ui64 TenantHiveTabletId; - const NMonitoring::TDynamicCounterPtr CountersRoot; - NMonitoring::TDynamicCounters::TCounterPtr ReconnectPipeCounter; - ui64 ReconnectStartTime = 0; + const NMonitoring::TDynamicCounterPtr Counters; + NMonitoring::TDynamicCounters::TCounterPtr HiveReconnectTimeCounter; + ui64 HiveReconnectStartTime = 0; NActors::TActorId HiveClientId; public: @@ -187,7 +187,7 @@ class THiveProxyActor final THiveProxyActor( THiveProxyConfig config, - NMonitoring::TDynamicCounterPtr countersRoot); + NMonitoring::TDynamicCounterPtr counters); void Bootstrap(const NActors::TActorContext& ctx); From e1ff782d26824e7764960ce7b7242805ddfed4d9 Mon Sep 17 00:00:00 2001 From: Egor Masharskii Date: Wed, 5 Feb 2025 16:48:33 +0000 Subject: [PATCH 3/4] update --- .../core/libs/hive_proxy/hive_proxy_actor.cpp | 25 ++++++++----------- .../core/libs/hive_proxy/hive_proxy_actor.h | 4 +-- .../core/libs/hive_proxy/hive_proxy_ut.cpp | 2 +- 3 files changed, 13 insertions(+), 18 deletions(-) diff --git a/cloud/storage/core/libs/hive_proxy/hive_proxy_actor.cpp b/cloud/storage/core/libs/hive_proxy/hive_proxy_actor.cpp index 3deba7c81ce..7aeb7269913 100644 --- a/cloud/storage/core/libs/hive_proxy/hive_proxy_actor.cpp +++ b/cloud/storage/core/libs/hive_proxy/hive_proxy_actor.cpp @@ -74,11 +74,9 @@ void THiveProxyActor::SendRequest( ui64 hive, IEventBase* request) { - if (auto clientId = ClientCache->Send(ctx, hive, request); - clientId != HiveClientId) - { - HiveClientId = clientId; - HiveReconnectStartTime = GetCycleCount(); + ClientCache->Send(ctx, hive, request); + if (HiveDisconnected) { + HiveReconnectStartCycles = GetCycleCount(); } } @@ -239,13 +237,14 @@ void THiveProxyActor::HandleConnect( auto error = MakeKikimrError(msg->Status, TStringBuilder() << "Connect to hive " << hive << " failed"); HandleConnectionError(ctx, error, hive, true); - } else if (msg->ClientId == HiveClientId && HiveReconnectStartTime) + } else if (HiveReconnectStartCycles) { if (HiveReconnectTimeCounter) { HiveReconnectTimeCounter->Add( - CyclesToDuration(GetCycleCount() - HiveReconnectStartTime).MicroSeconds()); + CyclesToDuration(GetCycleCount() - HiveReconnectStartCycles).MicroSeconds()); } - HiveReconnectStartTime = 0; + HiveReconnectStartCycles = 0; + HiveDisconnected = false; } } @@ -257,7 +256,6 @@ void THiveProxyActor::HandleDisconnect( ui64 hive = msg->TabletId; ClientCache->OnDisconnect(ev); - HiveClientId = {}; auto error = MakeError(E_REJECTED, TStringBuilder() << "Disconnected from hive " << hive); @@ -273,7 +271,7 @@ void THiveProxyActor::HandleConnectionError( Y_UNUSED(error); Y_UNUSED(connectFailed); - HiveClientId = {}; + HiveDisconnected = true; LOG_ERROR_S(ctx, LogComponent, "Pipe to hive" << hive << " has been reset "); @@ -340,11 +338,8 @@ void THiveProxyActor::HandleConnectionError( for (const auto& actorId: states->Actors) { auto clientId = ClientCache->Prepare(ctx, hive); - if (!HiveClientId) { - HiveClientId = clientId; - if (!HiveReconnectStartTime) { - HiveReconnectStartTime = GetCycleCount(); - } + if (!HiveReconnectStartCycles) { + HiveReconnectStartCycles = GetCycleCount(); } NCloud::Send( ctx, diff --git a/cloud/storage/core/libs/hive_proxy/hive_proxy_actor.h b/cloud/storage/core/libs/hive_proxy/hive_proxy_actor.h index 308dab94890..04b54f7efe2 100644 --- a/cloud/storage/core/libs/hive_proxy/hive_proxy_actor.h +++ b/cloud/storage/core/libs/hive_proxy/hive_proxy_actor.h @@ -179,8 +179,8 @@ class THiveProxyActor final const NMonitoring::TDynamicCounterPtr Counters; NMonitoring::TDynamicCounters::TCounterPtr HiveReconnectTimeCounter; - ui64 HiveReconnectStartTime = 0; - NActors::TActorId HiveClientId; + ui64 HiveReconnectStartCycles = 0; + bool HiveDisconnected = true; public: explicit THiveProxyActor(THiveProxyConfig config); diff --git a/cloud/storage/core/libs/hive_proxy/hive_proxy_ut.cpp b/cloud/storage/core/libs/hive_proxy/hive_proxy_ut.cpp index f40cf617abd..f263f5997f0 100644 --- a/cloud/storage/core/libs/hive_proxy/hive_proxy_ut.cpp +++ b/cloud/storage/core/libs/hive_proxy/hive_proxy_ut.cpp @@ -1570,7 +1570,7 @@ Y_UNIT_TEST_SUITE(THiveProxyTest) env.EnableTabletResolverScheduling(); env.RebootHive(); - for (int retries = 0; retries < 5 && !hiveLockRequests; ++retries) { + while (!hiveLockRequests) { // Pipe to hive may take a long time to connect // Wait until hive receives the lock request runtime.DispatchEvents(TDispatchOptions(), TDuration::Seconds(1)); From 51b3ffc73dab329242674cd1bee992655bc0a84c Mon Sep 17 00:00:00 2001 From: Egor Masharskii Date: Thu, 6 Feb 2025 09:01:47 +0000 Subject: [PATCH 4/4] update --- cloud/storage/core/libs/hive_proxy/hive_proxy_actor.cpp | 6 +++--- cloud/storage/core/libs/hive_proxy/hive_proxy_ut.cpp | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/cloud/storage/core/libs/hive_proxy/hive_proxy_actor.cpp b/cloud/storage/core/libs/hive_proxy/hive_proxy_actor.cpp index 7aeb7269913..e9e5c0616f6 100644 --- a/cloud/storage/core/libs/hive_proxy/hive_proxy_actor.cpp +++ b/cloud/storage/core/libs/hive_proxy/hive_proxy_actor.cpp @@ -237,11 +237,11 @@ void THiveProxyActor::HandleConnect( auto error = MakeKikimrError(msg->Status, TStringBuilder() << "Connect to hive " << hive << " failed"); HandleConnectionError(ctx, error, hive, true); - } else if (HiveReconnectStartCycles) - { + } else if (HiveReconnectStartCycles) { if (HiveReconnectTimeCounter) { HiveReconnectTimeCounter->Add( - CyclesToDuration(GetCycleCount() - HiveReconnectStartCycles).MicroSeconds()); + CyclesToDuration( + GetCycleCount() - HiveReconnectStartCycles).MicroSeconds()); } HiveReconnectStartCycles = 0; HiveDisconnected = false; diff --git a/cloud/storage/core/libs/hive_proxy/hive_proxy_ut.cpp b/cloud/storage/core/libs/hive_proxy/hive_proxy_ut.cpp index f263f5997f0..20c292e4950 100644 --- a/cloud/storage/core/libs/hive_proxy/hive_proxy_ut.cpp +++ b/cloud/storage/core/libs/hive_proxy/hive_proxy_ut.cpp @@ -514,7 +514,8 @@ struct TTestEnv }; HiveProxyActorId = Runtime.Register( CreateHiveProxy( - std::move(config), Runtime.GetAppData(0).Counters).release()); + std::move(config), + Runtime.GetAppData(0).Counters).release()); Runtime.EnableScheduleForActor(HiveProxyActorId); Runtime.RegisterService(MakeHiveProxyServiceId(), HiveProxyActorId); }