Skip to content

Commit

Permalink
issue-2969: add hive reconnect time counter (#2970)
Browse files Browse the repository at this point in the history
* issue-2969: add hive reconnect time counter

* update

* update

* update
  • Loading branch information
yegorskii authored Feb 6, 2025
1 parent 9c3f69c commit 33359ef
Show file tree
Hide file tree
Showing 9 changed files with 155 additions and 35 deletions.
23 changes: 14 additions & 9 deletions cloud/blockstore/libs/storage/init/disk_agent/actorsystem.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,15 +52,20 @@ class TStorageServicesInitializer final
// HiveProxy
//

auto hiveProxy = CreateHiveProxy({
.PipeClientRetryCount = Args.StorageConfig->GetPipeClientRetryCount(),
.PipeClientMinRetryTime = Args.StorageConfig->GetPipeClientMinRetryTime(),
.HiveLockExpireTimeout = Args.StorageConfig->GetHiveLockExpireTimeout(),
.LogComponent = TBlockStoreComponents::HIVE_PROXY,
.TabletBootInfoBackupFilePath = {},
.FallbackMode = false,
.TenantHiveTabletId = Args.StorageConfig->GetTenantHiveTabletId(),
});
auto hiveProxy = CreateHiveProxy(
{
.PipeClientRetryCount = Args.StorageConfig->GetPipeClientRetryCount(),
.PipeClientMinRetryTime = Args.StorageConfig->GetPipeClientMinRetryTime(),
.HiveLockExpireTimeout = Args.StorageConfig->GetHiveLockExpireTimeout(),
.LogComponent = TBlockStoreComponents::HIVE_PROXY,
.TabletBootInfoBackupFilePath = {},
.FallbackMode = false,
.TenantHiveTabletId = Args.StorageConfig->GetTenantHiveTabletId(),
},
appData
->Counters
->GetSubgroup("counters", "blockstore")
->GetSubgroup("component", "service"));

setup->LocalServices.emplace_back(
MakeHiveProxyServiceId(),
Expand Down
23 changes: 14 additions & 9 deletions cloud/blockstore/libs/storage/init/server/actorsystem.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -138,15 +138,20 @@ class TStorageServicesInitializer final
// HiveProxy
//

auto hiveProxy = CreateHiveProxy({
.PipeClientRetryCount = Args.StorageConfig->GetPipeClientRetryCount(),
.PipeClientMinRetryTime = Args.StorageConfig->GetPipeClientMinRetryTime(),
.HiveLockExpireTimeout = Args.StorageConfig->GetHiveLockExpireTimeout(),
.LogComponent = TBlockStoreComponents::HIVE_PROXY,
.TabletBootInfoBackupFilePath = Args.StorageConfig->GetTabletBootInfoBackupFilePath(),
.FallbackMode = Args.StorageConfig->GetHiveProxyFallbackMode(),
.TenantHiveTabletId = Args.StorageConfig->GetTenantHiveTabletId(),
});
auto hiveProxy = CreateHiveProxy(
{
.PipeClientRetryCount = Args.StorageConfig->GetPipeClientRetryCount(),
.PipeClientMinRetryTime = Args.StorageConfig->GetPipeClientMinRetryTime(),
.HiveLockExpireTimeout = Args.StorageConfig->GetHiveLockExpireTimeout(),
.LogComponent = TBlockStoreComponents::HIVE_PROXY,
.TabletBootInfoBackupFilePath = Args.StorageConfig->GetTabletBootInfoBackupFilePath(),
.FallbackMode = Args.StorageConfig->GetHiveProxyFallbackMode(),
.TenantHiveTabletId = Args.StorageConfig->GetTenantHiveTabletId(),
},
appData
->Counters
->GetSubgroup("counters", "blockstore")
->GetSubgroup("component", "service"));

setup->LocalServices.emplace_back(
MakeHiveProxyServiceId(),
Expand Down
25 changes: 15 additions & 10 deletions cloud/filestore/libs/storage/init/actorsystem.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -124,16 +124,21 @@ class TStorageServicesInitializer final
// HiveProxy
//

auto hiveProxy = CreateHiveProxy({
.PipeClientRetryCount = Args.StorageConfig->GetPipeClientRetryCount(),
.PipeClientMinRetryTime = Args.StorageConfig->GetPipeClientMinRetryTime(),
// HiveLockExpireTimeout, used by NBS, doesn't matter
.HiveLockExpireTimeout = TDuration::Seconds(1),
.LogComponent = TFileStoreComponents::HIVE_PROXY,
.TabletBootInfoBackupFilePath = Args.StorageConfig->GetTabletBootInfoBackupFilePath(),
.FallbackMode = Args.StorageConfig->GetHiveProxyFallbackMode(),
.TenantHiveTabletId = Args.StorageConfig->GetTenantHiveTabletId(),
});
auto hiveProxy = CreateHiveProxy(
{
.PipeClientRetryCount = Args.StorageConfig->GetPipeClientRetryCount(),
.PipeClientMinRetryTime = Args.StorageConfig->GetPipeClientMinRetryTime(),
// HiveLockExpireTimeout, used by NBS, doesn't matter
.HiveLockExpireTimeout = TDuration::Seconds(1),
.LogComponent = TFileStoreComponents::HIVE_PROXY,
.TabletBootInfoBackupFilePath = Args.StorageConfig->GetTabletBootInfoBackupFilePath(),
.FallbackMode = Args.StorageConfig->GetHiveProxyFallbackMode(),
.TenantHiveTabletId = Args.StorageConfig->GetTenantHiveTabletId(),
},
appData
->Counters
->GetSubgroup("counters", "filestore")
->GetSubgroup("component", "service"));

setup->LocalServices.emplace_back(
MakeHiveProxyServiceId(),
Expand Down
11 changes: 10 additions & 1 deletion cloud/storage/core/libs/hive_proxy/hive_proxy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,21 @@ using namespace NActors;
////////////////////////////////////////////////////////////////////////////////

// Convenience overload for callers that do not publish monitoring counters:
// forwards to the two-argument overload with an empty counters pointer.
IActorPtr CreateHiveProxy(THiveProxyConfig config)
{
    NMonitoring::TDynamicCounterPtr noCounters;
    return CreateHiveProxy(std::move(config), std::move(noCounters));
}

// Creates the hive proxy actor for the given configuration.
//
// When config.FallbackMode is set, a THiveProxyFallbackActor is returned and
// |counters| is not used. Otherwise a THiveProxyActor is returned; it takes
// over both |config| and |counters| (the latter may be empty).
IActorPtr CreateHiveProxy(
    THiveProxyConfig config,
    NMonitoring::TDynamicCounterPtr counters)
{
    if (config.FallbackMode) {
        return std::make_unique<THiveProxyFallbackActor>(std::move(config));
    }

    // The regular actor must be built through the two-argument constructor,
    // otherwise the counters handed to this factory would be silently lost.
    return std::make_unique<THiveProxyActor>(
        std::move(config),
        std::move(counters));
}

} // namespace NCloud::NStorage
5 changes: 5 additions & 0 deletions cloud/storage/core/libs/hive_proxy/hive_proxy.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,15 @@
#include <cloud/storage/core/libs/actors/public.h>
#include <cloud/storage/core/libs/common/public.h>

#include <library/cpp/monlib/dynamic_counters/counters.h>

namespace NCloud::NStorage {

////////////////////////////////////////////////////////////////////////////////

// Creates a hive proxy actor that does not publish monitoring counters.
// Equivalent to calling the two-argument overload with an empty counters
// pointer.
NActors::IActorPtr CreateHiveProxy(THiveProxyConfig config);
// Creates a hive proxy actor.
// If config.FallbackMode is set, the fallback actor is created and |counters|
// is not used. Otherwise the regular actor is created and, when |counters| is
// non-empty, it registers a "HiveReconnectTime" counter in that group.
NActors::IActorPtr CreateHiveProxy(
THiveProxyConfig config,
NMonitoring::TDynamicCounterPtr counters);

} // namespace NCloud::NStorage
44 changes: 39 additions & 5 deletions cloud/storage/core/libs/hive_proxy/hive_proxy_actor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,12 +34,19 @@ std::unique_ptr<NTabletPipe::IClientCache> CreateTabletPipeClientCache(

////////////////////////////////////////////////////////////////////////////////

// Builds the proxy from |config| and stores |counters| for later use.
//
// |counters| may be empty; the "HiveReconnectTime" counter is only looked up
// in Bootstrap() when it is not. The pipe client cache is created from the
// config before the individual settings are copied out of it.
THiveProxyActor::THiveProxyActor(
        THiveProxyConfig config,
        NMonitoring::TDynamicCounterPtr counters)
    : ClientCache(CreateTabletPipeClientCache(config))
    , LockExpireTimeout(config.HiveLockExpireTimeout)
    , LogComponent(config.LogComponent)
    , TabletBootInfoBackupFilePath(config.TabletBootInfoBackupFilePath)
    , TenantHiveTabletId(config.TenantHiveTabletId)
    , Counters(std::move(counters))
{}

// Counter-less construction: delegates to the two-argument constructor with
// an empty counters pointer.
THiveProxyActor::THiveProxyActor(THiveProxyConfig config)
    : THiveProxyActor(std::move(config), NMonitoring::TDynamicCounterPtr{})
{}

void THiveProxyActor::Bootstrap(const TActorContext& ctx)
Expand All @@ -55,10 +62,24 @@ void THiveProxyActor::Bootstrap(const TActorContext& ctx)
TabletBootInfoBackup = ctx.Register(
cache.release(), TMailboxType::HTSwap, AppData()->IOPoolId);
}
if (Counters) {
HiveReconnectTimeCounter = Counters->GetCounter("HiveReconnectTime", true);
}
}

////////////////////////////////////////////////////////////////////////////////

// Sends |request| to |hive| through the pipe client cache and, while the
// proxy considers the hive disconnected, stamps the start of the reconnect
// time measurement.
//
// ctx     - actor context used for the send.
// hive    - tablet id of the target hive.
// request - event to send; handed over to the pipe client cache.
void THiveProxyActor::SendRequest(
    const TActorContext& ctx,
    ui64 hive,
    IEventBase* request)
{
    ClientCache->Send(ctx, hive, request);

    if (!HiveDisconnected) {
        return;
    }

    // NOTE(review): this overwrites a start timestamp that is already set,
    // unlike HandleConnectionError which only sets it when it is zero, so
    // the measured time runs from the latest request sent while disconnected.
    // Confirm that this is the intended semantics.
    HiveReconnectStartCycles = GetCycleCount();
}

ui64 THiveProxyActor::GetHive(
const TActorContext& ctx,
ui64 tabletId,
Expand Down Expand Up @@ -98,15 +119,15 @@ void THiveProxyActor::SendLockRequest(
hiveRequest->Record.SetMaxReconnectTimeout(
LockExpireTimeout.MilliSeconds());
hiveRequest->Record.SetReconnect(reconnect);
ClientCache->Send(ctx, hive, hiveRequest.release());
SendRequest(ctx, hive, hiveRequest.release());
}

// Asks |hive| to release the execution lock held for |tabletId|.
//
// ctx      - actor context used for the send.
// hive     - tablet id of the hive that owns the lock.
// tabletId - tablet whose execution lock should be released.
void THiveProxyActor::SendUnlockRequest(
    const TActorContext& ctx, ui64 hive, ui64 tabletId)
{
    auto hiveRequest =
        std::make_unique<TEvHive::TEvUnlockTabletExecution>(tabletId);

    // The event is released exactly once and goes through SendRequest, so the
    // reconnect time bookkeeping sees this request as well.
    SendRequest(ctx, hive, hiveRequest.release());
}

void THiveProxyActor::SendGetTabletStorageInfoRequest(
Expand All @@ -115,7 +136,7 @@ void THiveProxyActor::SendGetTabletStorageInfoRequest(
{
auto hiveRequest =
std::make_unique<TEvHive::TEvGetTabletStorageInfo>(tabletId);
ClientCache->Send(ctx, hive, hiveRequest.release());
SendRequest(ctx, hive, hiveRequest.release());
}

void THiveProxyActor::SendLockReply(
Expand Down Expand Up @@ -198,7 +219,7 @@ void THiveProxyActor::SendTabletMetrics(
prTabletId.second.OnStatsSend();
}
if (record.TabletMetricsSize() > 0) {
ClientCache->Send(ctx, hive, event.Release());
SendRequest(ctx, hive, event.Release());
}
}

Expand All @@ -216,6 +237,14 @@ void THiveProxyActor::HandleConnect(
auto error = MakeKikimrError(msg->Status, TStringBuilder()
<< "Connect to hive " << hive << " failed");
HandleConnectionError(ctx, error, hive, true);
} else if (HiveReconnectStartCycles) {
if (HiveReconnectTimeCounter) {
HiveReconnectTimeCounter->Add(
CyclesToDuration(
GetCycleCount() - HiveReconnectStartCycles).MicroSeconds());
}
HiveReconnectStartCycles = 0;
HiveDisconnected = false;
}
}

Expand All @@ -242,6 +271,8 @@ void THiveProxyActor::HandleConnectionError(
Y_UNUSED(error);
Y_UNUSED(connectFailed);

HiveDisconnected = true;

LOG_ERROR_S(ctx, LogComponent,
"Pipe to hive" << hive << " has been reset ");

Expand Down Expand Up @@ -307,6 +338,9 @@ void THiveProxyActor::HandleConnectionError(

for (const auto& actorId: states->Actors) {
auto clientId = ClientCache->Prepare(ctx, hive);
if (!HiveReconnectStartCycles) {
HiveReconnectStartCycles = GetCycleCount();
}
NCloud::Send<TEvHiveProxyPrivate::TEvChangeTabletClient>(
ctx,
actorId,
Expand Down
14 changes: 14 additions & 0 deletions cloud/storage/core/libs/hive_proxy/hive_proxy_actor.h
Original file line number Diff line number Diff line change
Expand Up @@ -177,14 +177,28 @@ class THiveProxyActor final

const ui64 TenantHiveTabletId;

// Counters group passed to the constructor; may be empty, in which case no
// reconnect time counter is registered.
const NMonitoring::TDynamicCounterPtr Counters;
// "HiveReconnectTime" counter looked up in Bootstrap() when Counters is set;
// HandleConnect adds the measured reconnect duration to it in microseconds.
NMonitoring::TDynamicCounters::TCounterPtr HiveReconnectTimeCounter;
// Cycle count at which the current reconnect measurement started.
// Zero means that no measurement is in progress.
ui64 HiveReconnectStartCycles = 0;
// Raised in HandleConnectionError and cleared in HandleConnect once a
// successful connect has been accounted for. Starts as true, so the first
// request sent through SendRequest begins a measurement.
bool HiveDisconnected = true;

public:
explicit THiveProxyActor(THiveProxyConfig config);

THiveProxyActor(
THiveProxyConfig config,
NMonitoring::TDynamicCounterPtr counters);

void Bootstrap(const NActors::TActorContext& ctx);

private:
STFUNC(StateWork);

void SendRequest(
const NActors::TActorContext& ctx,
ui64 hive,
NActors::IEventBase* request);

ui64 GetHive(
const NActors::TActorContext& ctx,
ui64 tabletId,
Expand Down
43 changes: 42 additions & 1 deletion cloud/storage/core/libs/hive_proxy/hive_proxy_ut.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -513,7 +513,9 @@ struct TTestEnv
.TenantHiveTabletId = tenantHive,
};
HiveProxyActorId = Runtime.Register(
CreateHiveProxy(std::move(config)).release());
CreateHiveProxy(
std::move(config),
Runtime.GetAppData(0).Counters).release());
Runtime.EnableScheduleForActor(HiveProxyActorId);
Runtime.RegisterService(MakeHiveProxyServiceId(), HiveProxyActorId);
}
Expand Down Expand Up @@ -1541,6 +1543,45 @@ Y_UNIT_TEST_SUITE(THiveProxyTest)
UNIT_ASSERT_VALUES_EQUAL(1, hiveMessages);
UNIT_ASSERT_VALUES_EQUAL(1, wakeups);
}

// Checks that the "HiveReconnectTime" counter is updated both on the initial
// connect to hive and on the reconnect that follows a hive reboot.
Y_UNIT_TEST(ShouldReportHiveReconnectTime)
{
    TTestBasicRuntime runtime;
    TTestEnv env(runtime);

    auto edge = runtime.AllocateEdgeActor();

    auto rootCounters = env.Runtime.GetAppData(0).Counters;
    auto reconnectTime = rootCounters->GetCounter("HiveReconnectTime", true);

    // The very first lock request has to establish the pipe to hive,
    // which already counts as a measured connect.
    env.SendLockRequest(edge, FakeTablet2);
    UNIT_ASSERT_VALUES_UNEQUAL(0, reconnectTime->Val());

    const auto valueBeforeReboot = reconnectTime->Val();

    // Count lock requests that reach hive after the reboot.
    int lockRequestsSeen = 0;
    runtime.SetObserverFunc(
        [&](TAutoPtr<IEventHandle>& event)
        {
            Y_UNUSED(runtime);
            if (event->GetTypeRewrite() == TEvHive::EvLockTabletExecution) {
                ++lockRequestsSeen;
            }
            return TTestActorRuntime::EEventAction::PROCESS;
        });

    env.EnableTabletResolverScheduling();
    env.RebootHive();

    // Establishing the pipe to the rebooted hive may take a while, so keep
    // dispatching until hive has received the re-sent lock request.
    while (lockRequestsSeen == 0) {
        runtime.DispatchEvents(TDispatchOptions(), TDuration::Seconds(1));
    }

    runtime.SetObserverFunc(&TTestActorRuntime::DefaultObserverFunc);

    // The reconnect caused by the reboot must have been added to the counter.
    UNIT_ASSERT_GT(reconnectTime->Val(), valueBeforeReboot);
}
}

} // namespace NCloud::NStorage
2 changes: 2 additions & 0 deletions cloud/storage/core/libs/hive_proxy/ya.make
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ PEERDIR(
contrib/ydb/core/tablet_flat

contrib/ydb/library/actors/core

library/cpp/monlib/dynamic_counters
)

END()
Expand Down

0 comments on commit 33359ef

Please sign in to comment.