Skip to content

Commit

Permalink
Write a critical event when a checksum mismatch happens
Browse files Browse the repository at this point in the history
  • Loading branch information
drbasic committed Feb 10, 2025
1 parent 5810363 commit dd87238
Show file tree
Hide file tree
Showing 13 changed files with 70 additions and 9 deletions.
3 changes: 2 additions & 1 deletion cloud/blockstore/libs/diagnostics/critical_events.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ namespace NCloud::NBlockStore {
xxx(MirroredDiskDeviceReplacementRateLimitExceeded) \
xxx(MirroredDiskMinorityChecksumMismatch) \
xxx(MirroredDiskMajorityChecksumMismatch) \
xxx(MirroredDiskChecksumMismatchUponRead) \
xxx(MirroredDiskChecksumMismatchUponWrite) \
xxx(CounterUpdateRace) \
xxx(EndpointStartingError) \
xxx(ResyncFailed) \
Expand Down Expand Up @@ -64,7 +66,6 @@ namespace NCloud::NBlockStore {
xxx(DiskRegistryPurgeHostError) \
xxx(DiskRegistryCleanupAgentConfigError) \
xxx(DiskRegistryOccupiedDeviceConfigurationHasChanged) \
xxx(MirroredDiskChecksumMismatchUponRead) \
xxx(DiskRegistryWrongMigratedDeviceOwnership) \
xxx(DiskRegistryInitialAgentRejectionThresholdExceeded) \
// BLOCKSTORE_CRITICAL_EVENTS
Expand Down
1 change: 1 addition & 0 deletions cloud/blockstore/libs/nbd/server_handler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -951,6 +951,7 @@ IServerHandlerFactoryPtr CreateServerHandlerFactory(
{
auto deviceHandler = deviceHandlerFactory->CreateDeviceHandler(
std::move(storage),
options.DiskId,
options.ClientId,
options.BlockSize,
options.UnalignedRequestsDisabled,
Expand Down
8 changes: 5 additions & 3 deletions cloud/blockstore/libs/service/aligned_device_handler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -131,14 +131,16 @@ TBlocksInfo TBlocksInfo::MakeAligned() const

TAlignedDeviceHandler::TAlignedDeviceHandler(
IStoragePtr storage,
TString diskId,
TString clientId,
ui32 blockSize,
ui32 maxSubRequestSize,
bool checkBufferModificationDuringWriting)
: Storage(
checkBufferModificationDuringWriting
? CreateChecksumStorageWrapper(std::move(storage))
: std::move(storage))
checkBufferModificationDuringWriting ? CreateChecksumStorageWrapper(
std::move(storage),
std::move(diskId))
: std::move(storage))
, ClientId(std::move(clientId))
, BlockSize(blockSize)
, MaxBlockCount(maxSubRequestSize / BlockSize)
Expand Down
1 change: 1 addition & 0 deletions cloud/blockstore/libs/service/aligned_device_handler.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ class TAlignedDeviceHandler final
public:
TAlignedDeviceHandler(
IStoragePtr storage,
TString diskId,
TString clientId,
ui32 blockSize,
ui32 maxSubRequestSize,
Expand Down
18 changes: 15 additions & 3 deletions cloud/blockstore/libs/service/checksum_storage_wrapper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
#include "storage.h"

#include <cloud/blockstore/libs/common/block_checksum.h>
#include <cloud/blockstore/libs/common/block_range.h>
#include <cloud/blockstore/libs/common/iovector.h>
#include <cloud/blockstore/libs/diagnostics/critical_events.h>
#include <cloud/blockstore/libs/service/context.h>
#include <cloud/storage/core/libs/common/error.h>
#include <cloud/storage/core/libs/common/sglist.h>
Expand Down Expand Up @@ -64,10 +66,12 @@ class TChecksumStorageWrapper final
, public IStorage
{
const IStoragePtr Storage;
const TString DiskId;

public:
explicit TChecksumStorageWrapper(IStoragePtr storage)
TChecksumStorageWrapper(IStoragePtr storage, TString diskId)
: Storage(std::move(storage))
, DiskId(std::move(diskId))
{}

TFuture<NProto::TZeroBlocksResponse> ZeroBlocks(
Expand Down Expand Up @@ -165,6 +169,12 @@ TChecksumStorageWrapper::RetryWriteBlocksLocal(
TCallContextPtr callContext,
std::shared_ptr<NProto::TWriteBlocksLocalRequest> request)
{
const auto range = TBlockRange64::WithLength(
request->GetStartIndex(),
request->BlocksCount);
ReportMirroredDiskChecksumMismatchUponWrite(
TStringBuilder() << "d:" << DiskId << ", r:" << range);

auto guard = request->Sglist.Acquire();
if (!guard) {
return MakeFuture<NProto::TWriteBlocksLocalResponse>(
Expand All @@ -190,9 +200,11 @@ TChecksumStorageWrapper::RetryWriteBlocksLocal(

////////////////////////////////////////////////////////////////////////////////

IStoragePtr CreateChecksumStorageWrapper(IStoragePtr storage)
IStoragePtr CreateChecksumStorageWrapper(IStoragePtr storage, TString diskId)
{
return std::make_shared<TChecksumStorageWrapper>(std::move(storage));
return std::make_shared<TChecksumStorageWrapper>(
std::move(storage),
std::move(diskId));
}

} // namespace NCloud::NBlockStore
2 changes: 1 addition & 1 deletion cloud/blockstore/libs/service/checksum_storage_wrapper.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ namespace NCloud::NBlockStore {

////////////////////////////////////////////////////////////////////////////////

IStoragePtr CreateChecksumStorageWrapper(IStoragePtr storage);
IStoragePtr CreateChecksumStorageWrapper(IStoragePtr storage, TString diskId);

////////////////////////////////////////////////////////////////////////////////

Expand Down
3 changes: 3 additions & 0 deletions cloud/blockstore/libs/service/device_handler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ struct TDefaultDeviceHandlerFactory final

IDeviceHandlerPtr CreateDeviceHandler(
IStoragePtr storage,
TString diskId,
TString clientId,
ui32 blockSize,
bool unalignedRequestsDisabled,
Expand All @@ -38,6 +39,7 @@ struct TDefaultDeviceHandlerFactory final
if (unalignedRequestsDisabled) {
return std::make_shared<TAlignedDeviceHandler>(
std::move(storage),
std::move(diskId),
std::move(clientId),
blockSize,
MaxSubRequestSize,
Expand All @@ -46,6 +48,7 @@ struct TDefaultDeviceHandlerFactory final

return std::make_shared<TUnalignedDeviceHandler>(
std::move(storage),
std::move(diskId),
std::move(clientId),
blockSize,
MaxSubRequestSize,
Expand Down
1 change: 1 addition & 0 deletions cloud/blockstore/libs/service/device_handler.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ struct IDeviceHandlerFactory

virtual IDeviceHandlerPtr CreateDeviceHandler(
IStoragePtr storage,
TString diskId,
TString clientId,
ui32 blockSize,
bool unalignedRequestsDisabled,
Expand Down
34 changes: 33 additions & 1 deletion cloud/blockstore/libs/service/device_handler_ut.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,12 @@
#include <cloud/blockstore/libs/service/context.h>
#include <cloud/blockstore/libs/service/storage_test.h>
#include <cloud/storage/core/libs/common/error.h>

#include <cloud/storage/core/libs/common/sglist.h>
#include <cloud/storage/core/libs/common/sglist_test.h>
#include <cloud/storage/core/libs/diagnostics/critical_events.h>
#include <cloud/storage/core/libs/diagnostics/monitoring.h>

#include <library/cpp/monlib/dynamic_counters/counters.h>
#include <library/cpp/testing/unittest/registar.h>

#include <array>
Expand All @@ -20,6 +22,15 @@ namespace {

////////////////////////////////////////////////////////////////////////////////

auto SetupCriticalEvents()
{
NMonitoring::TDynamicCountersPtr counters =
new NMonitoring::TDynamicCounters();
InitCriticalEventsCounter(counters);
return counters;
}

////////////////////////////////////////////////////////////////////////////////
class TTestEnvironment
{
private:
Expand Down Expand Up @@ -126,6 +137,7 @@ class TTestEnvironment
auto factory = CreateDeviceHandlerFactory(maxBlockCount * BlockSize);
DeviceHandler = factory->CreateDeviceHandler(
std::move(testStorage),
"disk1",
"testClientId",
BlockSize,
unalignedRequestsDisabled, // unalignedRequestsDisabled,
Expand Down Expand Up @@ -330,6 +342,7 @@ Y_UNIT_TEST_SUITE(TDeviceHandlerTest)

Y_UNIT_TEST(ShouldSliceHugeZeroRequest)
{
const auto diskId = "disk1";
const auto clientId = "testClientId";
const ui32 blockSize = DefaultBlockSize;
const ui64 deviceBlocksCount = 8*1024;
Expand All @@ -340,6 +353,7 @@ Y_UNIT_TEST_SUITE(TDeviceHandlerTest)
auto factory = CreateDeviceHandlerFactory(blocksCountLimit * blockSize);
auto deviceHandler = factory->CreateDeviceHandler(
storage,
diskId,
clientId,
blockSize,
false, // unalignedRequestsDisabled,
Expand Down Expand Up @@ -397,13 +411,15 @@ Y_UNIT_TEST_SUITE(TDeviceHandlerTest)

Y_UNIT_TEST(ShouldHandleAlignedRequestsWhenUnalignedRequestsDisabled)
{
const auto diskId = "disk1";
const auto clientId = "testClientId";
const ui32 blockSize = DefaultBlockSize;

auto storage = std::make_shared<TTestStorage>();

auto device = CreateDefaultDeviceHandlerFactory()->CreateDeviceHandler(
storage,
diskId,
clientId,
blockSize,
true, // unalignedRequestsDisabled,
Expand Down Expand Up @@ -485,13 +501,15 @@ Y_UNIT_TEST_SUITE(TDeviceHandlerTest)

Y_UNIT_TEST(ShouldNotHandleUnalignedRequestsWhenUnalignedRequestsDisabled)
{
const auto diskId = "disk1";
const auto clientId = "testClientId";
const ui32 blockSize = DefaultBlockSize;

auto storage = std::make_shared<TTestStorage>();

auto device = CreateDefaultDeviceHandlerFactory()->CreateDeviceHandler(
storage,
diskId,
clientId,
blockSize,
true, // unalignedRequestsDisabled,
Expand Down Expand Up @@ -594,6 +612,7 @@ Y_UNIT_TEST_SUITE(TDeviceHandlerTest)

void DoShouldSliceHugeZeroRequest(bool requestUnaligned, bool unalignedRequestDisabled)
{
const auto diskId = "disk1";
const auto clientId = "testClientId";
const ui32 blockSize = DefaultBlockSize;
const ui64 deviceBlocksCount = 12;
Expand All @@ -607,6 +626,7 @@ Y_UNIT_TEST_SUITE(TDeviceHandlerTest)
auto factory = CreateDeviceHandlerFactory(blocksCountLimit * blockSize);
auto deviceHandler = factory->CreateDeviceHandler(
storage,
diskId,
clientId,
blockSize,
unalignedRequestDisabled, // unalignedRequestsDisabled,
Expand Down Expand Up @@ -716,6 +736,7 @@ Y_UNIT_TEST_SUITE(TDeviceHandlerTest)

Y_UNIT_TEST(ShouldReturnErrorForHugeUnalignedReadWriteRequests)
{
const auto diskId = "disk1";
const auto clientId = "testClientId";
const ui32 blockSize = DefaultBlockSize;

Expand All @@ -724,6 +745,7 @@ Y_UNIT_TEST_SUITE(TDeviceHandlerTest)
auto deviceHandler =
CreateDefaultDeviceHandlerFactory()->CreateDeviceHandler(
storage,
diskId,
clientId,
blockSize,
false, // unalignedRequestsDisabled,
Expand Down Expand Up @@ -782,6 +804,7 @@ Y_UNIT_TEST_SUITE(TDeviceHandlerTest)

Y_UNIT_TEST(ShouldReturnErrorForInvalidBufferSize)
{
const auto diskId = "disk1";
const auto clientId = "testClientId";
const ui32 blockSize = DefaultBlockSize;

Expand All @@ -790,6 +813,7 @@ Y_UNIT_TEST_SUITE(TDeviceHandlerTest)
auto deviceHandler =
CreateDefaultDeviceHandlerFactory()->CreateDeviceHandler(
storage,
diskId,
clientId,
blockSize,
false, // unalignedRequestsDisabled,
Expand Down Expand Up @@ -905,6 +929,7 @@ Y_UNIT_TEST_SUITE(TDeviceHandlerTest)

Y_UNIT_TEST(ShouldCopyBufferWhenClientModifyBuffer)
{
const auto diskId = "disk1";
const auto clientId = "testClientId";
const ui32 blockSize = DefaultBlockSize;
const ui64 deviceBlocksCount = 8*1024;
Expand All @@ -915,6 +940,7 @@ Y_UNIT_TEST_SUITE(TDeviceHandlerTest)
auto factory = CreateDeviceHandlerFactory(blocksCountLimit * blockSize);
auto deviceHandler = factory->CreateDeviceHandler(
storage,
diskId,
clientId,
blockSize,
false, // unalignedRequestsDisabled,
Expand Down Expand Up @@ -945,6 +971,11 @@ Y_UNIT_TEST_SUITE(TDeviceHandlerTest)
return MakeFuture<NProto::TWriteBlocksLocalResponse>();
};

auto counters = SetupCriticalEvents();
auto mirroredDiskChecksumMismatchUponWrite = counters->GetCounter(
"AppCriticalEvents/MirroredDiskChecksumMismatchUponWrite",
true);

auto future = deviceHandler->Write(
MakeIntrusive<TCallContext>(),
0,
Expand All @@ -954,6 +985,7 @@ Y_UNIT_TEST_SUITE(TDeviceHandlerTest)
const auto& response = future.GetValue(TDuration::Seconds(5));
UNIT_ASSERT(!HasError(response));
UNIT_ASSERT_VALUES_EQUAL(2, writeAttempts);
UNIT_ASSERT_VALUES_EQUAL(1, mirroredDiskChecksumMismatchUponWrite->Val());
}
}

Expand Down
2 changes: 2 additions & 0 deletions cloud/blockstore/libs/service/unaligned_device_handler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -468,13 +468,15 @@ TZeroRequest::TResponseFuture TZeroRequest::ModifyAndWrite()

TUnalignedDeviceHandler::TUnalignedDeviceHandler(
IStoragePtr storage,
TString diskId,
TString clientId,
ui32 blockSize,
ui32 maxSubRequestSize,
ui32 maxUnalignedRequestSize,
bool checkBufferModificationDuringWriting)
: Backend(std::make_shared<TAlignedDeviceHandler>(
std::move(storage),
std::move(diskId),
std::move(clientId),
blockSize,
maxSubRequestSize,
Expand Down
1 change: 1 addition & 0 deletions cloud/blockstore/libs/service/unaligned_device_handler.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ class TUnalignedDeviceHandler final
public:
TUnalignedDeviceHandler(
IStoragePtr storage,
TString diskId,
TString clientId,
ui32 blockSize,
ui32 maxSubRequestSize,
Expand Down
4 changes: 4 additions & 0 deletions cloud/blockstore/libs/service/ut/ya.make
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,8 @@ SRCS(
storage_ut.cpp
)

PEERDIR(
cloud/blockstore/libs/diagnostics
)

END()
1 change: 1 addition & 0 deletions cloud/blockstore/libs/vhost/server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -419,6 +419,7 @@ class TExecutor final
{
auto deviceHandler = AppCtx.DeviceHandlerFactory->CreateDeviceHandler(
std::move(storage),
options.DiskId,
options.ClientId,
options.BlockSize,
options.UnalignedRequestsDisabled,
Expand Down

0 comments on commit dd87238

Please sign in to comment.