diff --git a/cloud/filestore/config/filesystem.proto b/cloud/filestore/config/filesystem.proto index 6837b895f57..bbe6bfb6e96 100644 --- a/cloud/filestore/config/filesystem.proto +++ b/cloud/filestore/config/filesystem.proto @@ -88,4 +88,10 @@ message TFileSystemConfig // Disable fsync queue which is used to synchronize data/metadata ops during fsync optional bool FSyncQueueDisabled = 26; + + // Enable directory handles storage functionality + optional bool DirectoryHandlesStorageEnabled = 27; + + // Capacity for DirectoryHandles persistent table. + optional uint64 DirectoryHandlesTableSize = 28; } diff --git a/cloud/filestore/config/server.proto b/cloud/filestore/config/server.proto index cf32af237c2..2be3a625944 100644 --- a/cloud/filestore/config/server.proto +++ b/cloud/filestore/config/server.proto @@ -171,6 +171,12 @@ message TLocalServiceConfig // Disable fsync queue which is used to synchronize data/metadata ops during fsync optional bool FSyncQueueDisabled = 29; + + // Enable directory handles storage functionality + optional bool DirectoryHandlesStorageEnabled = 30; + + // Capacity for DirectoryHandles persistent table. + optional uint64 DirectoryHandlesTableSize = 31; } //////////////////////////////////////////////////////////////////////////////// diff --git a/cloud/filestore/config/storage.proto b/cloud/filestore/config/storage.proto index f2501b1d25a..51f780e7ca3 100644 --- a/cloud/filestore/config/storage.proto +++ b/cloud/filestore/config/storage.proto @@ -664,4 +664,9 @@ message TStorageConfig // Disable fsync queue which is used to synchronize data/metadata ops during fsync optional bool FSyncQueueDisabled = 465; + // Enable directory handles storage functionality. + optional bool DirectoryHandlesStorageEnabled = 466; + + // Capacity for DirectoryHandles persistent table. + optional uint64 DirectoryHandlesTableSize = 467; } diff --git a/cloud/filestore/config/vfs.proto b/cloud/filestore/config/vfs.proto index 117ccc43812..d3c3e118362 100644 --- a/cloud/filestore/config/vfs.proto +++ b/cloud/filestore/config/vfs.proto @@ -73,4 +73,12 @@ message TVFSConfig // operation. If this limit is exceeded, flush will be split into // a sequence of multiple flush operations. optional uint64 WriteBackCacheFlushMaxSumWriteRequestsSize = 21; + + // Path to the directory where DirectoryHandles storage is stored for all sessions. + // Will create a directory with filesystem id as name + // (and a subdirectory with session id as name) if not present. + optional string DirectoryHandlesStoragePath = 22; + + // Initial data area size for DirectoryHandles persistent table. + optional uint64 DirectoryHandlesInitialDataSize = 23; } diff --git a/cloud/filestore/config/vhost.proto b/cloud/filestore/config/vhost.proto index 3a2126cc473..ba8769ba35a 100644 --- a/cloud/filestore/config/vhost.proto +++ b/cloud/filestore/config/vhost.proto @@ -75,6 +75,14 @@ message TVhostServiceConfig // operation. If this limit is exceeded, flush will be split into // a sequence of multiple flush operations. optional uint64 WriteBackCacheFlushMaxSumWriteRequestsSize = 18; + + // Path to the directory where DirectoryHandles storage is stored for all sessions. + // Will create a directory with filesystem id as name + // (and a subdirectory with session id as name) if not present. + optional string DirectoryHandlesStoragePath = 19; + + // Initial data area size for DirectoryHandles persistent table. 
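+    // Defaults to 1 GiB when left unset (see the TVhostServiceConfig
+    // defaults in cloud/filestore/libs/endpoint_vhost/config.cpp).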
+ optional uint64 DirectoryHandlesInitialDataSize = 20; } //////////////////////////////////////////////////////////////////////////////// diff --git a/cloud/filestore/libs/daemon/vhost/bootstrap.cpp b/cloud/filestore/libs/daemon/vhost/bootstrap.cpp index 82e43b8a1e3..e8dc540450b 100644 --- a/cloud/filestore/libs/daemon/vhost/bootstrap.cpp +++ b/cloud/filestore/libs/daemon/vhost/bootstrap.cpp @@ -419,7 +419,12 @@ void TBootstrapVhost::InitEndpoints() ->GetWriteBackCacheFlushMaxWriteRequestsCount(), .FlushMaxSumWriteRequestsSize = Configs->VhostServiceConfig - ->GetWriteBackCacheFlushMaxSumWriteRequestsSize(), + ->GetWriteBackCacheFlushMaxSumWriteRequestsSize() + }, + TDirectoryHandlesStorageConfig{ + .PathPrefix = Configs->VhostServiceConfig->GetDirectoryHandlesStoragePath(), + .InitialDataSize = + Configs->VhostServiceConfig->GetDirectoryHandlesInitialDataSize() } ); diff --git a/cloud/filestore/libs/diagnostics/critical_events.h b/cloud/filestore/libs/diagnostics/critical_events.h index ba1dc64ca36..5072be21c08 100644 --- a/cloud/filestore/libs/diagnostics/critical_events.h +++ b/cloud/filestore/libs/diagnostics/critical_events.h @@ -36,6 +36,7 @@ namespace NCloud::NFileStore{ xxx(InvalidShardNo) \ xxx(WriteBackCacheCreatingOrDeletingError) \ xxx(ErrorWasSentToTheGuest) \ + xxx(DirectoryHandlesStorageError) \ // FILESTORE_CRITICAL_EVENTS #define FILESTORE_IMPOSSIBLE_EVENTS(xxx) \ diff --git a/cloud/filestore/libs/endpoint_vhost/config.cpp b/cloud/filestore/libs/endpoint_vhost/config.cpp index 7bd0cb143b4..2b2a74d4c0f 100644 --- a/cloud/filestore/libs/endpoint_vhost/config.cpp +++ b/cloud/filestore/libs/endpoint_vhost/config.cpp @@ -42,6 +42,8 @@ static constexpr int MODE0660 = S_IRGRP | S_IWGRP | S_IRUSR | S_IWUSR; xxx(WriteBackCacheFlushMaxWriteRequestSize, ui32, 1_MB )\ xxx(WriteBackCacheFlushMaxWriteRequestsCount, ui32, 64 )\ xxx(WriteBackCacheFlushMaxSumWriteRequestsSize, ui32, 32_MB )\ + xxx(DirectoryHandlesStoragePath, TString, "" )\ + xxx(DirectoryHandlesInitialDataSize, ui64, 1_GB )\ // VHOST_SERVICE_CONFIG #define VHOST_SERVICE_DECLARE_CONFIG(name, type, value) \ diff --git a/cloud/filestore/libs/endpoint_vhost/config.h b/cloud/filestore/libs/endpoint_vhost/config.h index 39655da2be6..b19d9db9843 100644 --- a/cloud/filestore/libs/endpoint_vhost/config.h +++ b/cloud/filestore/libs/endpoint_vhost/config.h @@ -42,6 +42,9 @@ class TVhostServiceConfig ui32 GetWriteBackCacheFlushMaxWriteRequestsCount() const; ui32 GetWriteBackCacheFlushMaxSumWriteRequestsSize() const; + TString GetDirectoryHandlesStoragePath() const; + ui64 GetDirectoryHandlesInitialDataSize() const; + void Dump(IOutputStream& out) const; void DumpHtml(IOutputStream& out) const; }; diff --git a/cloud/filestore/libs/endpoint_vhost/listener.cpp b/cloud/filestore/libs/endpoint_vhost/listener.cpp index 2c47bb36254..57887467093 100644 --- a/cloud/filestore/libs/endpoint_vhost/listener.cpp +++ b/cloud/filestore/libs/endpoint_vhost/listener.cpp @@ -67,6 +67,7 @@ class TEndpointListener final const IFileSystemLoopFactoryPtr LoopFactory; const THandleOpsQueueConfig HandleOpsQueueConfig; const TWriteBackCacheConfig WriteBackCacheConfig; + const TDirectoryHandlesStorageConfig DirectoryHandlesStorageConfig; TLog Log; @@ -78,7 +79,8 @@ class TEndpointListener final IFileStoreEndpointsPtr filestoreEndpoints, IFileSystemLoopFactoryPtr loopFactory, THandleOpsQueueConfig handleOpsQueueConfig, - TWriteBackCacheConfig writeBackCacheConfig) + TWriteBackCacheConfig writeBackCacheConfig, + TDirectoryHandlesStorageConfig 
directoryHandlesStorageConfig) : Logging(std::move(logging)) , Timer(std::move(timer)) , Scheduler(std::move(scheduler)) @@ -86,6 +88,7 @@ class TEndpointListener final , LoopFactory(std::move(loopFactory)) , HandleOpsQueueConfig(std::move(handleOpsQueueConfig)) , WriteBackCacheConfig(std::move(writeBackCacheConfig)) + , DirectoryHandlesStorageConfig(std::move(directoryHandlesStorageConfig)) { Log = Logging->CreateLog("NFS_VHOST"); } @@ -137,6 +140,8 @@ class TEndpointListener final WriteBackCacheConfig.FlushMaxWriteRequestsCount); protoConfig.SetWriteBackCacheFlushMaxSumWriteRequestsSize( WriteBackCacheConfig.FlushMaxSumWriteRequestsSize); + protoConfig.SetDirectoryHandlesStoragePath(DirectoryHandlesStorageConfig.PathPrefix); + protoConfig.SetDirectoryHandlesInitialDataSize(DirectoryHandlesStorageConfig.InitialDataSize); auto vFSConfig = std::make_shared(std::move(protoConfig)); auto Loop = LoopFactory->Create( @@ -158,7 +163,8 @@ IEndpointListenerPtr CreateEndpointListener( IFileStoreEndpointsPtr filestoreEndpoints, IFileSystemLoopFactoryPtr loopFactory, THandleOpsQueueConfig handleOpsQueueConfig, - TWriteBackCacheConfig writeBackCacheConfig) + TWriteBackCacheConfig writeBackCacheConfig, + TDirectoryHandlesStorageConfig directoryHandlesStorageConfig) { return std::make_shared( std::move(logging), @@ -167,7 +173,8 @@ IEndpointListenerPtr CreateEndpointListener( std::move(filestoreEndpoints), std::move(loopFactory), std::move(handleOpsQueueConfig), - std::move(writeBackCacheConfig)); + std::move(writeBackCacheConfig), + std::move(directoryHandlesStorageConfig)); } } // namespace NCloud::NFileStore::NVhost diff --git a/cloud/filestore/libs/endpoint_vhost/listener.h b/cloud/filestore/libs/endpoint_vhost/listener.h index c7973df34d3..de995c783a9 100644 --- a/cloud/filestore/libs/endpoint_vhost/listener.h +++ b/cloud/filestore/libs/endpoint_vhost/listener.h @@ -35,6 +35,14 @@ struct TWriteBackCacheConfig //////////////////////////////////////////////////////////////////////////////// +struct TDirectoryHandlesStorageConfig +{ + TString PathPrefix; + ui64 InitialDataSize = 0; +}; + +//////////////////////////////////////////////////////////////////////////////// + IEndpointListenerPtr CreateEndpointListener( ILoggingServicePtr logging, ITimerPtr timer, @@ -42,6 +50,7 @@ IEndpointListenerPtr CreateEndpointListener( IFileStoreEndpointsPtr filestoreEndpoints, NVFS::IFileSystemLoopFactoryPtr loopFactory, THandleOpsQueueConfig handleOpsQueueConfig, - TWriteBackCacheConfig writeBackCacheConfig); + TWriteBackCacheConfig writeBackCacheConfig, + TDirectoryHandlesStorageConfig directoryHandlesStorageConfig); } // namespace NCloud::NFileStore::NVhost diff --git a/cloud/filestore/libs/service_local/config.cpp b/cloud/filestore/libs/service_local/config.cpp index ed16c88a36b..dd71c499f2d 100644 --- a/cloud/filestore/libs/service_local/config.cpp +++ b/cloud/filestore/libs/service_local/config.cpp @@ -42,6 +42,8 @@ namespace { xxx(MaxFuseLoopThreads, ui32, 1 )\ xxx(ZeroCopyWriteEnabled, bool, false )\ xxx(FSyncQueueDisabled, bool, false )\ + xxx(DirectoryHandlesStorageEnabled, bool, false )\ + xxx(DirectoryHandlesTableSize, ui64, 100000 )\ // FILESTORE_SERVICE_CONFIG #define FILESTORE_SERVICE_NULL_FILE_IO_CONFIG(xxx) \ diff --git a/cloud/filestore/libs/service_local/config.h b/cloud/filestore/libs/service_local/config.h index a55e9248cae..5cd59136990 100644 --- a/cloud/filestore/libs/service_local/config.h +++ b/cloud/filestore/libs/service_local/config.h @@ -116,6 +116,10 @@ class TLocalFileStoreConfig bool 
GetZeroCopyWriteEnabled() const; bool GetFSyncQueueDisabled() const; + + bool GetDirectoryHandlesStorageEnabled() const; + + ui64 GetDirectoryHandlesTableSize() const; }; } // namespace NCloud::NFileStore diff --git a/cloud/filestore/libs/service_local/fs_session.cpp b/cloud/filestore/libs/service_local/fs_session.cpp index 291da4d1f52..17a1df61aeb 100644 --- a/cloud/filestore/libs/service_local/fs_session.cpp +++ b/cloud/filestore/libs/service_local/fs_session.cpp @@ -37,15 +37,21 @@ NProto::TCreateSessionResponse TLocalFileSystem::CreateSession( features->SetAsyncHandleOperationPeriod( Config->GetAsyncHandleOperationPeriod().MilliSeconds()); features->SetZeroCopyEnabled(Config->GetZeroCopyEnabled()); - features->SetGuestPageCacheDisabled(Config->GetGuestPageCacheDisabled()); - features->SetExtendedAttributesDisabled(Config->GetExtendedAttributesDisabled()); + features->SetGuestPageCacheDisabled( + Config->GetGuestPageCacheDisabled()); + features->SetExtendedAttributesDisabled( + Config->GetExtendedAttributesDisabled()); features->SetServerWriteBackCacheEnabled( Config->GetServerWriteBackCacheEnabled()); - features->SetMaxBackground( - Config->GetMaxBackground()); - features->SetMaxFuseLoopThreads( - Config->GetMaxFuseLoopThreads()); + features->SetMaxBackground(Config->GetMaxBackground()); + features->SetMaxFuseLoopThreads(Config->GetMaxFuseLoopThreads()); features->SetFSyncQueueDisabled(Config->GetFSyncQueueDisabled()); + features->SetDirectoryHandlesStorageEnabled( + Config->GetDirectoryHandlesStorageEnabled()); + if (Config->GetDirectoryHandlesStorageEnabled()) { + features->SetDirectoryHandlesTableSize( + Config->GetDirectoryHandlesTableSize()); + } return response; }; diff --git a/cloud/filestore/libs/storage/core/config.cpp b/cloud/filestore/libs/storage/core/config.cpp index bdaa6f3a166..7caaab896eb 100644 --- a/cloud/filestore/libs/storage/core/config.cpp +++ b/cloud/filestore/libs/storage/core/config.cpp @@ -295,6 +295,9 @@ using TAliases = NProto::TStorageConfig::TFilestoreAliases; xxx(ZeroCopyWriteEnabled, bool, false )\ \ xxx(FSyncQueueDisabled, bool, false )\ + \ + xxx(DirectoryHandlesStorageEnabled, bool, false )\ + xxx(DirectoryHandlesTableSize, ui64, 100'000 )\ // FILESTORE_STORAGE_CONFIG #define FILESTORE_STORAGE_CONFIG_REF(xxx) \ diff --git a/cloud/filestore/libs/storage/core/config.h b/cloud/filestore/libs/storage/core/config.h index 7968d67b416..4a27e73cde2 100644 --- a/cloud/filestore/libs/storage/core/config.h +++ b/cloud/filestore/libs/storage/core/config.h @@ -345,6 +345,9 @@ class TStorageConfig bool GetZeroCopyWriteEnabled() const; bool GetFSyncQueueDisabled() const; + + bool GetDirectoryHandlesStorageEnabled() const; + ui64 GetDirectoryHandlesTableSize() const; }; } // namespace NCloud::NFileStore::NStorage diff --git a/cloud/filestore/libs/storage/tablet/tablet_actor_createsession.cpp b/cloud/filestore/libs/storage/tablet/tablet_actor_createsession.cpp index ebb8db92407..4dcec34467c 100644 --- a/cloud/filestore/libs/storage/tablet/tablet_actor_createsession.cpp +++ b/cloud/filestore/libs/storage/tablet/tablet_actor_createsession.cpp @@ -61,6 +61,14 @@ void FillFeatures( features->SetParentlessFilesOnly(config.GetParentlessFilesOnly()); features->SetAllowHandlelessIO(config.GetAllowHandlelessIO()); + features->SetDirectoryHandlesStorageEnabled( + config.GetDirectoryHandlesStorageEnabled()); + + if (config.GetDirectoryHandlesStorageEnabled()) { + features->SetDirectoryHandlesTableSize( + config.GetDirectoryHandlesTableSize()); + } + 
features->SetDirectoryCreationInShardsEnabled( fileSystem.GetDirectoryCreationInShardsEnabled()); diff --git a/cloud/filestore/libs/vfs/config.cpp b/cloud/filestore/libs/vfs/config.cpp index c1cf77f08c8..ce2e76a393b 100644 --- a/cloud/filestore/libs/vfs/config.cpp +++ b/cloud/filestore/libs/vfs/config.cpp @@ -38,6 +38,9 @@ namespace { xxx(WriteBackCacheFlushMaxWriteRequestSize, ui32, 1_MB )\ xxx(WriteBackCacheFlushMaxWriteRequestsCount, ui32, 64 )\ xxx(WriteBackCacheFlushMaxSumWriteRequestsSize, ui32, 32_MB )\ + \ + xxx(DirectoryHandlesStoragePath, TString, "" )\ + xxx(DirectoryHandlesInitialDataSize, ui64, 1_GB )\ // FILESTORE_VFS_CONFIG #define FILESTORE_VFS_DECLARE_CONFIG(name, type, value) \ diff --git a/cloud/filestore/libs/vfs/config.h b/cloud/filestore/libs/vfs/config.h index fc3ad2d4ee3..2619b0aaba9 100644 --- a/cloud/filestore/libs/vfs/config.h +++ b/cloud/filestore/libs/vfs/config.h @@ -51,6 +51,9 @@ struct TVFSConfig ui32 GetWriteBackCacheFlushMaxWriteRequestsCount() const; ui32 GetWriteBackCacheFlushMaxSumWriteRequestsSize() const; + TString GetDirectoryHandlesStoragePath() const; + ui64 GetDirectoryHandlesInitialDataSize() const; + bool GetGuestKeepCacheAllowed() const; void Dump(IOutputStream& out) const; diff --git a/cloud/filestore/libs/vfs_fuse/config.cpp b/cloud/filestore/libs/vfs_fuse/config.cpp index dd53379ac04..e30bd557d82 100644 --- a/cloud/filestore/libs/vfs_fuse/config.cpp +++ b/cloud/filestore/libs/vfs_fuse/config.cpp @@ -41,6 +41,10 @@ namespace { \ xxx(ServerWriteBackCacheEnabled, bool, false )\ \ + xxx(DirectoryHandlesStorageEnabled, bool, false )\ + \ + xxx(DirectoryHandlesTableSize, ui64, 100'000 )\ + \ xxx(GuestKeepCacheAllowed, bool, false )\ xxx(MaxBackground, ui32, 0 )\ xxx(MaxFuseLoopThreads, ui32, 1 )\ diff --git a/cloud/filestore/libs/vfs_fuse/config.h b/cloud/filestore/libs/vfs_fuse/config.h index 1a187f50ea9..21592c0f782 100644 --- a/cloud/filestore/libs/vfs_fuse/config.h +++ b/cloud/filestore/libs/vfs_fuse/config.h @@ -51,6 +51,10 @@ struct TFileSystemConfig bool GetServerWriteBackCacheEnabled() const; + bool GetDirectoryHandlesStorageEnabled() const; + + ui64 GetDirectoryHandlesTableSize() const; + bool GetGuestKeepCacheAllowed() const; ui32 GetMaxBackground() const; diff --git a/cloud/filestore/libs/vfs_fuse/directory_handles_storage.cpp b/cloud/filestore/libs/vfs_fuse/directory_handles_storage.cpp new file mode 100644 index 00000000000..44c76a2a987 --- /dev/null +++ b/cloud/filestore/libs/vfs_fuse/directory_handles_storage.cpp @@ -0,0 +1,274 @@ +#include "directory_handles_storage.h" + +#include + +#include + +namespace NCloud::NFileStore::NFuse { + +//////////////////////////////////////////////////////////////////////////////// + +TDirectoryHandlesStorage::TDirectoryHandlesStorage( + TLog& log, + const TString& filePath, + ui64 recordsCount, + ui64 initialDataAreaSize, + ui64 initialDataCompactionBufferSize) + : Log(log) +{ + Table = std::make_unique( + filePath, + recordsCount, + initialDataAreaSize, + initialDataCompactionBufferSize, + 30); +} + +void TDirectoryHandlesStorage::StoreHandle( + ui64 handleId, + const TDirectoryHandleChunk& initialHandleChunk) +{ + if (HandleIdToIndices.contains(handleId)) { + ReportDirectoryHandlesStorageError( + "Failed to store record with existing handle id"); + return; + } + + TBuffer record = SerializeHandle(handleId, initialHandleChunk); + + TGuard guard(TableLock); + + CreateRecord(handleId, record); +} + +void TDirectoryHandlesStorage::UpdateHandle( + ui64 handleId, + const TDirectoryHandleChunk& 
handleChunk)
+{
+    TBuffer record = SerializeHandle(handleId, handleChunk);
+
+    TGuard guard(TableLock);
+
+    // UpdateHandle can be called after the handle has already been deleted;
+    // in that case just log and return
+    if (!HandleIdToIndices.contains(handleId)) {
+        STORAGE_DEBUG(
+            "failed to update record for handle %lu, handle is already deleted",
+            handleId);
+        return;
+    }
+
+    CreateRecord(handleId, record);
+}
+
+void TDirectoryHandlesStorage::CreateRecord(
+    ui64 handleId,
+    const TBuffer& record)
+{
+    ui64 recordIndex = CreateRecord(record);
+
+    if (recordIndex == TDirectoryHandleTable::InvalidIndex) {
+        ReportDirectoryHandlesStorageError(
+            "Failed to create record for directory handle chunk");
+        return;
+    }
+
+    HandleIdToIndices[handleId].push_back(recordIndex);
+}
+
+void TDirectoryHandlesStorage::RemoveHandle(ui64 handleId)
+{
+    TGuard guard(TableLock);
+    if (HandleIdToIndices.contains(handleId)) {
+        for (auto recordIndex: HandleIdToIndices[handleId]) {
+            if (!Table->DeleteRecord(recordIndex)) {
+                STORAGE_DEBUG(
+                    "failed to delete record for handle %lu using index %lu",
+                    handleId,
+                    recordIndex);
+            }
+        }
+    }
+    HandleIdToIndices.erase(handleId);
+}
+
+void TDirectoryHandlesStorage::ResetHandle(ui64 handleId)
+{
+    TGuard guard(TableLock);
+    if (HandleIdToIndices.contains(handleId)) {
+        for (auto it = std::next(HandleIdToIndices[handleId].begin(), 1);
+             it != HandleIdToIndices[handleId].end();
+             ++it)
+        {
+            if (!Table->DeleteRecord(*it)) {
+                STORAGE_DEBUG(
+                    "failed to delete record for handle %lu using index %lu",
+                    handleId,
+                    *it);
+            }
+        }
+        HandleIdToIndices[handleId].erase(
+            std::next(HandleIdToIndices[handleId].begin(), 1),
+            HandleIdToIndices[handleId].end());
+    }
+}
+
+void TDirectoryHandlesStorage::LoadHandles(TDirectoryHandleMap& handles)
+{
+    // Since we store data in chunks instead of a single block, in rare cases
+    // a crash during the reset or removal process can lead to inconsistent
+    // chunk order. We detect this inconsistency during the load phase and fix
+    // it.
+    struct TUpdateVersionInfo
+    {
+        ui64 LargestUpdateVersion = 0;
+        ui64 ChunksCount = 0;
+        bool IsFirstChunkPresented = false;
+    };
+
+    TMap<ui64, TUpdateVersionInfo> updateVersionInfo;
+
+    {
+        TGuard guard(TableLock);
+
+        for (auto it = Table->begin(); it != Table->end(); ++it) {
+            TStringBuf record = *it;
+            if (record.empty()) {
+                STORAGE_TRACE(
+                    "bad record from storage during load directory handles");
+                continue;
+            }
+
+            auto [handleId, chunk] = DeserializeHandleChunk(record);
+
+            if (!handleId) {
+                STORAGE_DEBUG(
+                    "bad deserialize for record %lu from storage during load "
+                    "directory handles",
+                    it.GetIndex());
+                continue;
+            }
+
+            if (!handles.contains(handleId)) {
+                handles[handleId] =
+                    std::make_shared<TDirectoryHandle>(chunk->Index);
+            }
+
+            handles[handleId]->ConsumeChunk(*chunk);
+
+            updateVersionInfo[handleId].ChunksCount++;
+            if (chunk->UpdateVersion >
+                updateVersionInfo[handleId].LargestUpdateVersion)
+            {
+                updateVersionInfo[handleId].LargestUpdateVersion =
+                    chunk->UpdateVersion;
+            }
+
+            // Always store the first chunk at the beginning of the list — this
+            // helps us handle the reset logic correctly and efficiently.
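+            // For example, a listing that needed two ListNodes calls is kept
+            // as chunks with UpdateVersion 0 (written by StoreHandle in
+            // OpenDir), 1 and 2; ResetHandle later relies on the version-0
+            // chunk being the first index recorded for the handle.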
+ if (chunk->UpdateVersion == 0) { + updateVersionInfo[handleId].IsFirstChunkPresented = true; + HandleIdToIndices[handleId].insert( + HandleIdToIndices[handleId].begin(), + it.GetIndex()); + } else { + HandleIdToIndices[handleId].push_back(it.GetIndex()); + } + } + } + + for (auto [handleId, updateVersionInfo]: updateVersionInfo) { + if (!updateVersionInfo.IsFirstChunkPresented) { + ReportDirectoryHandlesStorageError( + TStringBuilder() + << "First chunk for handle " << handleId << " is missing"); + + RemoveHandle(handleId); + continue; + } + + if (updateVersionInfo.ChunksCount != + updateVersionInfo.LargestUpdateVersion + 1) + { + ReportDirectoryHandlesStorageError( + TStringBuilder() + << "Total chunks count " << updateVersionInfo.ChunksCount + << " is not equal to largest update version " + << updateVersionInfo.LargestUpdateVersion << " + 1"); + ResetHandle(handleId); + continue; + } + } +} + +void TDirectoryHandlesStorage::Clear() +{ + TGuard guard(TableLock); + Table->Clear(); + HandleIdToIndices.clear(); +} + +TBuffer TDirectoryHandlesStorage::SerializeHandle( + ui64 handleId, + const TDirectoryHandleChunk& handleChunk) const +{ + TBuffer buffer; + + TBufferOutput output(buffer); + output.Write(&handleId, sizeof(handleId)); + handleChunk.Serialize(output); + + return buffer; +} + +TDirectoryHandleChunkPair TDirectoryHandlesStorage::DeserializeHandleChunk( + const TStringBuf& buffer) +{ + TMemoryInput input(buffer); + ui64 handleId; + if (input.Load(&handleId, sizeof(handleId)) != sizeof(handleId)) { + return {0, std::nullopt}; + } + + auto chunk = TDirectoryHandleChunk::Deserialize(input); + if (!chunk) { + return {0, std::nullopt}; + } + + return {handleId, chunk}; +} + +ui64 TDirectoryHandlesStorage::CreateRecord(const TBuffer& record) +{ + ui64 index = Table->AllocRecord(record.Size()); + if (index == TDirectoryHandleTable::InvalidIndex) { + return TDirectoryHandleTable::InvalidIndex; + } + + if (!Table->WriteRecordData(index, record.Data(), record.Size())) { + return TDirectoryHandleTable::InvalidIndex; + } + + Table->CommitRecord(index); + + return index; +} + +//////////////////////////////////////////////////////////////////////////////// + +TDirectoryHandlesStoragePtr CreateDirectoryHandlesStorage( + TLog& log, + const TString& filePath, + ui64 recordsCount, + ui64 initialDataAreaSize, + ui64 initialDataCompactionBufferSize) +{ + return std::make_unique( + log, + filePath, + recordsCount, + initialDataAreaSize, + initialDataCompactionBufferSize); +} + +} // namespace NCloud::NFileStore::NFuse diff --git a/cloud/filestore/libs/vfs_fuse/directory_handles_storage.h b/cloud/filestore/libs/vfs_fuse/directory_handles_storage.h new file mode 100644 index 00000000000..eeddba44266 --- /dev/null +++ b/cloud/filestore/libs/vfs_fuse/directory_handles_storage.h @@ -0,0 +1,81 @@ +#pragma once + +#include "public.h" + +#include "fs_directory_handle.h" + +#include +#include +#include + +#include +#include +#include +#include + +namespace NCloud::NFileStore::NFuse { + +//////////////////////////////////////////////////////////////////////////////// + +class TDirectoryHandle; + +using TDirectoryHandleMap = THashMap>; +using TDirectoryHandleChunkPair = + std::pair>; + +//////////////////////////////////////////////////////////////////////////////// + +class TDirectoryHandlesStorage +{ +private: + struct TDirectoryHandleTableHeader + { + }; + + using TDirectoryHandleTable = + TDynamicPersistentTable; + + TLog Log; + + // Mutex for Table and HandleIdToIndices + TMutex TableLock; + 
std::unique_ptr Table; + THashMap> HandleIdToIndices; + +public: + explicit TDirectoryHandlesStorage( + TLog& log, + const TString& filePath, + ui64 recordsCount, + ui64 initialDataAreaSize, + ui64 initialDataCompactionBufferSize); + + void StoreHandle( + ui64 handleId, + const TDirectoryHandleChunk& initialHandleChunk); + void UpdateHandle(ui64 handleId, const TDirectoryHandleChunk& handleChunk); + void RemoveHandle(ui64 handleId); + void ResetHandle(ui64 handleId); + void LoadHandles(TDirectoryHandleMap& handles); + void Clear(); + +private: + TBuffer SerializeHandle( + ui64 handleId, + const TDirectoryHandleChunk& handleChunk) const; + TDirectoryHandleChunkPair DeserializeHandleChunk(const TStringBuf& buffer); + + ui64 CreateRecord(const TBuffer& record); + void CreateRecord(ui64 handleId, const TBuffer& record); +}; + +//////////////////////////////////////////////////////////////////////////////// + +TDirectoryHandlesStoragePtr CreateDirectoryHandlesStorage( + TLog& log, + const TString& filePath, + ui64 recordsCount, + ui64 initialDataAreaSize, + ui64 initialDataCompactionBufferSize); + +} // namespace NCloud::NFileStore::NFuse diff --git a/cloud/filestore/libs/vfs_fuse/fs.cpp b/cloud/filestore/libs/vfs_fuse/fs.cpp index a1f6673c339..7984a647b24 100644 --- a/cloud/filestore/libs/vfs_fuse/fs.cpp +++ b/cloud/filestore/libs/vfs_fuse/fs.cpp @@ -448,6 +448,7 @@ IFileSystemPtr CreateFileSystem( IRequestStatsPtr stats, ICompletionQueuePtr queue, THandleOpsQueuePtr handleOpsQueue, + TDirectoryHandlesStoragePtr directoryHandlesStorage, TWriteBackCache writeBackCache) { return std::make_shared( @@ -460,6 +461,7 @@ IFileSystemPtr CreateFileSystem( std::move(stats), std::move(queue), std::move(handleOpsQueue), + std::move(directoryHandlesStorage), std::move(writeBackCache)); } diff --git a/cloud/filestore/libs/vfs_fuse/fs.h b/cloud/filestore/libs/vfs_fuse/fs.h index 6cced440589..6d125669327 100644 --- a/cloud/filestore/libs/vfs_fuse/fs.h +++ b/cloud/filestore/libs/vfs_fuse/fs.h @@ -476,6 +476,7 @@ IFileSystemPtr CreateFileSystem( IRequestStatsPtr stats, ICompletionQueuePtr queue, THandleOpsQueuePtr handleOpsQueue, + TDirectoryHandlesStoragePtr directoryHandlesStorage, TWriteBackCache writeBackCache); } // namespace NCloud::NFileStore::NFuse diff --git a/cloud/filestore/libs/vfs_fuse/fs_directory_handle.cpp b/cloud/filestore/libs/vfs_fuse/fs_directory_handle.cpp new file mode 100644 index 00000000000..295941e7e46 --- /dev/null +++ b/cloud/filestore/libs/vfs_fuse/fs_directory_handle.cpp @@ -0,0 +1,180 @@ +#include "fs_directory_handle.h" + +namespace NCloud::NFileStore::NFuse { + +void TDirectoryHandleChunk::Serialize(TBufferOutput& output) const +{ + output.Write(&Index, sizeof(Index)); + output.Write(&UpdateVersion, sizeof(UpdateVersion)); + + ui32 cookieLen = Cookie.size(); + output.Write(&cookieLen, sizeof(cookieLen)); + if (cookieLen > 0) { + output.Write(Cookie.data(), cookieLen); + } + + if (!Key) { + return; + } + + ui64 keyValue = Key.value(); + + output.Write(&keyValue, sizeof(keyValue)); + ui32 bufferSize = DirectoryContent.Content != nullptr + ? 
DirectoryContent.Content->Size() + : 0; + output.Write(&bufferSize, sizeof(bufferSize)); + if (bufferSize > 0) { + output.Write(DirectoryContent.Content->Data(), bufferSize); + } +} + +std::optional TDirectoryHandleChunk::Deserialize( + TMemoryInput& input) +{ + if (input.Avail() < sizeof(fuse_ino_t)) { + return std::nullopt; + } + + TDirectoryHandleChunk chunk; + + if (input.Load(&chunk.Index, sizeof(chunk.Index)) != sizeof(chunk.Index)) { + return std::nullopt; + } + + if (input.Load(&chunk.UpdateVersion, sizeof(chunk.UpdateVersion)) != + sizeof(chunk.UpdateVersion)) + { + return std::nullopt; + } + + ui32 cookieLen = 0; + if (input.Load(&cookieLen, sizeof(cookieLen)) != sizeof(cookieLen)) { + return std::nullopt; + } + + if (cookieLen > 0) { + if (input.Avail() < cookieLen) { + return std::nullopt; + } + chunk.Cookie.resize(cookieLen); + if (input.Load(chunk.Cookie.begin(), cookieLen) != cookieLen) { + return std::nullopt; + } + } + + if (input.Avail() > 0) { + ui64 key = 0; + if (input.Load(&key, sizeof(key)) != sizeof(key)) { + return std::nullopt; + } + chunk.Key = key; + + ui32 bufferSize = 0; + if (input.Load(&bufferSize, sizeof(bufferSize)) != sizeof(bufferSize)) { + return std::nullopt; + } + + if (bufferSize > 0) { + if (input.Avail() < bufferSize) { + return std::nullopt; + } + chunk.DirectoryContent.Content = std::make_shared(); + chunk.DirectoryContent.Content->Resize(bufferSize); + if (input.Load( + chunk.DirectoryContent.Content->Data(), + bufferSize) != bufferSize) + { + return std::nullopt; + } + } + } + + return chunk; +} + +TDirectoryHandleChunk TDirectoryHandle::UpdateContent( + size_t size, + size_t offset, + const TBufferPtr& content, + TString cookie) +{ + size_t end = offset + content->size(); + TDirectoryHandleChunk chunk{ + .Key = end, + .Index = Index, + .DirectoryContent = {content, 0, size}}; + + with_lock (Lock) { + Y_ABORT_UNLESS(Content.upper_bound(end) == Content.end()); + Content[end] = content; + Cookie = std::move(cookie); + chunk.Cookie = Cookie; + chunk.UpdateVersion = ++UpdateVersion; + } + + return chunk; +} + +TMaybe +TDirectoryHandle::ReadContent(size_t size, size_t offset, TLog& Log) +{ + size_t end = 0; + TBufferPtr content = nullptr; + + with_lock (Lock) { + auto it = Content.upper_bound(offset); + if (it != Content.end()) { + end = it->first; + content = it->second; + } else if (Cookie) { + return Nothing(); + } + } + + TDirectoryContent result; + if (content) { + offset = offset - (end - content->size()); + if (offset >= content->size()) { + STORAGE_ERROR("off %lu size %lu", offset, content->size()); + return Nothing(); + } + result = {content, offset, size}; + } + + return result; +} + +void TDirectoryHandle::ResetContent() +{ + with_lock (Lock) { + Content.clear(); + Cookie.clear(); + UpdateVersion = 0; + } +} + +TString TDirectoryHandle::GetCookie() +{ + with_lock (Lock) { + return Cookie; + } +} + +void TDirectoryHandle::ConsumeChunk(TDirectoryHandleChunk& chunk) +{ + Y_ABORT_UNLESS(Index == chunk.Index); + + if (chunk.UpdateVersion > UpdateVersion) { + UpdateVersion = chunk.UpdateVersion; + Cookie = std::move(chunk.Cookie); + } + + if (chunk.Key) { + Content.emplace( + chunk.Key.value(), + std::move(chunk.DirectoryContent.Content)); + } +} + +} // namespace NCloud::NFileStore::NFuse diff --git a/cloud/filestore/libs/vfs_fuse/fs_directory_handle.h b/cloud/filestore/libs/vfs_fuse/fs_directory_handle.h new file mode 100644 index 00000000000..04641c918ff --- /dev/null +++ b/cloud/filestore/libs/vfs_fuse/fs_directory_handle.h @@ -0,0 +1,86 @@ 
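+// In-memory state of an open directory handle (TDirectoryHandle) and the
+// serializable TDirectoryHandleChunk that TDirectoryHandlesStorage persists,
+// so that cached directory listings can be restored after a restart.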
+#pragma once + +#include "public.h" + +#include "fs.h" + +#include + +#include +#include +#include +#include +#include + +namespace NCloud::NFileStore::NFuse { + +using namespace NCloud::NFileStore::NVFS; + +using TBufferPtr = std::shared_ptr; + +struct TDirectoryContent +{ + TBufferPtr Content = nullptr; + size_t Offset = 0; + size_t Size = 0; + + const char* GetData() const + { + return Content ? Content->Data() + Offset : nullptr; + } + + size_t GetSize() const + { + return Content ? Min(Size, Content->Size() - Offset) : 0; + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +struct TDirectoryHandleChunk +{ + std::optional Key; + ui64 UpdateVersion = 0; + ui64 Index = 0; + TString Cookie; + TDirectoryContent DirectoryContent; + + void Serialize(TBufferOutput& output) const; + static std::optional Deserialize( + TMemoryInput& input); +}; + +//////////////////////////////////////////////////////////////////////////////// + +class TDirectoryHandle +{ +private: + TString Cookie; + TMap Content; + ui64 UpdateVersion = 0; + + TMutex Lock; + +public: + const fuse_ino_t Index; + + explicit TDirectoryHandle(fuse_ino_t ino) + : Index(ino) + {} + + TDirectoryHandleChunk UpdateContent( + size_t size, + size_t offset, + const TBufferPtr& content, + TString cookie); + + TMaybe + ReadContent(size_t size, size_t offset, TLog& Log); + void ResetContent(); + TString GetCookie(); + + // not thread safe, use only during restoration from storage + void ConsumeChunk(TDirectoryHandleChunk& chunk); +}; + +} // namespace NCloud::NFileStore::NFuse diff --git a/cloud/filestore/libs/vfs_fuse/fs_impl.cpp b/cloud/filestore/libs/vfs_fuse/fs_impl.cpp index 50d8837f81f..678cf760e16 100644 --- a/cloud/filestore/libs/vfs_fuse/fs_impl.cpp +++ b/cloud/filestore/libs/vfs_fuse/fs_impl.cpp @@ -34,6 +34,7 @@ TFileSystem::TFileSystem( IRequestStatsPtr stats, ICompletionQueuePtr queue, THandleOpsQueuePtr handleOpsQueue, + TDirectoryHandlesStoragePtr directoryHandlesStorage, TWriteBackCache writeBackCache) : Logging(std::move(logging)) , ProfileLog(std::move(profileLog)) @@ -48,6 +49,7 @@ TFileSystem::TFileSystem( Config->GetXAttrCacheLimit(), Config->GetXAttrCacheTimeout()) , HandleOpsQueue(std::move(handleOpsQueue)) + , DirectoryHandlesStorage(std::move(directoryHandlesStorage)) , WriteBackCache(std::move(writeBackCache)) { Log = Logging->CreateLog("NFS_FUSE"); @@ -57,11 +59,15 @@ TFileSystem::TFileSystem( FSyncQueue = std::make_unique(Config->GetFileSystemId(), Logging); } + + if (DirectoryHandlesStorage) { + DirectoryHandlesStorage->LoadHandles(DirectoryHandles); + } } TFileSystem::~TFileSystem() { - Reset(); + ClearDirectoryCache(); } void TFileSystem::Init() @@ -73,7 +79,15 @@ void TFileSystem::Init() void TFileSystem::Reset() { STORAGE_INFO("resetting filesystem cache"); - ClearDirectoryCache(); + with_lock (DirectoryHandlesLock) { + STORAGE_DEBUG("clear directory cache of size %lu", + DirectoryHandles.size()); + DirectoryHandles.clear(); + + if (DirectoryHandlesStorage) { + DirectoryHandlesStorage->Clear(); + } + } } void TFileSystem::ScheduleProcessHandleOpsQueue() diff --git a/cloud/filestore/libs/vfs_fuse/fs_impl.h b/cloud/filestore/libs/vfs_fuse/fs_impl.h index 83986713d9f..25bd97940bc 100644 --- a/cloud/filestore/libs/vfs_fuse/fs_impl.h +++ b/cloud/filestore/libs/vfs_fuse/fs_impl.h @@ -3,6 +3,7 @@ #include "public.h" #include "config.h" +#include "directory_handles_storage.h" #include "fs.h" #include "handle_ops_queue.h" #include "node_cache.h" @@ -89,6 +90,8 @@ class 
TFileSystem final THandleOpsQueuePtr HandleOpsQueue; TMutex HandleOpsQueueLock; + TDirectoryHandlesStoragePtr DirectoryHandlesStorage; + TQueue DelayedReleaseQueue; TMutex DelayedReleaseQueueLock; @@ -105,6 +108,7 @@ class TFileSystem final IRequestStatsPtr stats, ICompletionQueuePtr queue, THandleOpsQueuePtr handleOpsQueue, + TDirectoryHandlesStoragePtr directoryHandlesStorage, TWriteBackCache writeBackCache); ~TFileSystem(); diff --git a/cloud/filestore/libs/vfs_fuse/fs_impl_list.cpp b/cloud/filestore/libs/vfs_fuse/fs_impl_list.cpp index cfb0d1a8935..e32d9e1c15a 100644 --- a/cloud/filestore/libs/vfs_fuse/fs_impl_list.cpp +++ b/cloud/filestore/libs/vfs_fuse/fs_impl_list.cpp @@ -1,5 +1,7 @@ #include "fs_impl.h" +#include "fs_directory_handle.h" + #include #include #include @@ -11,108 +13,6 @@ namespace NCloud::NFileStore::NFuse { using namespace NCloud::NFileStore::NVFS; -namespace { - -//////////////////////////////////////////////////////////////////////////////// - -using TBufferPtr = std::shared_ptr; - -//////////////////////////////////////////////////////////////////////////////// - -struct TDirectoryContent -{ - TBufferPtr Content = nullptr; - size_t Offset = 0; - size_t Size = 0; - - const char* GetData() const - { - return Content ? Content->Data() + Offset : nullptr; - } - - size_t GetSize() const - { - return Content ? Min(Size, Content->Size() - Offset) : 0; - } -}; - -} // namespace - -//////////////////////////////////////////////////////////////////////////////// - -class TDirectoryHandle -{ -private: - TMap Content; - TMutex Lock; - -public: - const fuse_ino_t Index; - TString Cookie; - - explicit TDirectoryHandle(fuse_ino_t ino) - : Index(ino) - {} - - TDirectoryContent UpdateContent( - size_t size, - size_t offset, - const TBufferPtr& content, - TString cookie) - { - with_lock (Lock) { - size_t end = offset + content->size(); - Y_ABORT_UNLESS(Content.upper_bound(end) == Content.end()); - Content[end] = content; - Cookie = std::move(cookie); - } - - return TDirectoryContent{content, 0, size}; - } - - TMaybe ReadContent(size_t size, size_t offset, TLog& Log) - { - size_t end = 0; - TBufferPtr content = nullptr; - - with_lock (Lock) { - auto it = Content.upper_bound(offset); - if (it != Content.end()) { - end = it->first; - content = it->second; - } else if (Cookie) { - return Nothing(); - } - } - - TDirectoryContent result; - if (content) { - offset = offset - (end - content->size()); - if (offset >= content->size()) { - STORAGE_ERROR("off %lu size %lu", offset, content->size()); - return Nothing(); - } - result = {content, offset, size}; - } - - return result; - } - - void ResetContent() - { - with_lock (Lock) { - Content.clear(); - Cookie.clear(); - } - } - - TString GetCookie() - { - with_lock (Lock) { - return Cookie; - } - } -}; namespace { @@ -237,6 +137,10 @@ void TFileSystem::OpenDir( do { id = RandomNumber(); } while (!DirectoryHandles.try_emplace(id, handle).second); + + if (DirectoryHandlesStorage) { + DirectoryHandlesStorage->StoreHandle(id, TDirectoryHandleChunk{.Index = ino}); + } } fuse_file_info info = {}; @@ -248,6 +152,9 @@ void TFileSystem::OpenDir( // syscall was interrupted with_lock (DirectoryHandlesLock) { DirectoryHandles.erase(id); + if (DirectoryHandlesStorage) { + DirectoryHandlesStorage->RemoveHandle(id); + } } } } @@ -262,7 +169,8 @@ void TFileSystem::ReadDir( { STORAGE_DEBUG("ReadDir #" << ino << " offset:" << offset - << " size:" << size); + << " size:" << size + << " fh:" << fi->fh); std::shared_ptr handle; with_lock (DirectoryHandlesLock) { 
@@ -298,6 +206,9 @@ void TFileSystem::ReadDir( if (!offset) { // directory contents need to be refreshed on rewinddir() handle->ResetContent(); + if (DirectoryHandlesStorage) { + DirectoryHandlesStorage->ResetHandle(fi->fh); + } } else if (auto content = handle->ReadContent(size, offset, Log)) { reply(*this, *content); return; @@ -308,74 +219,90 @@ void TFileSystem::ReadDir( request->SetMaxBytes(Config->GetMaxBufferSize()); Session->ListNodes(callContext, std::move(request)) - .Subscribe([=, ptr = weak_from_this()] (const auto& future) -> void { - auto self = ptr.lock(); - const auto& response = future.GetValue(); - if (!CheckResponse(self, *callContext, req, response)) { - return; - } - - if (response.NodesSize() != response.NamesSize()) { - STORAGE_ERROR("listnodes #" << fuse_req_unique(req) - << " names/nodes count mismatch"); - - self->ReplyError( - *callContext, - response.GetError(), - req, - EIO); - return; - } + .Subscribe( + [=, fh = fi->fh, ptr = weak_from_this()](const auto& future) -> void + { + auto self = ptr.lock(); + const auto& response = future.GetValue(); + if (!CheckResponse(self, *callContext, req, response)) { + return; + } - TDirectoryBuilder builder(size); - if (offset == 0) { - builder.Add(req, ".", { .attr = {.st_ino = MissingNodeId}}, offset); - builder.Add(req, "..", { .attr = {.st_ino = MissingNodeId}}, offset); - } + if (response.NodesSize() != response.NamesSize()) { + STORAGE_ERROR( + "listnodes #" << fuse_req_unique(req) + << " names/nodes count mismatch"); - for (size_t i = 0; i < response.NodesSize(); ++i) { - const auto& attr = response.GetNodes(i); - const auto& name = response.GetNames(i); - - fuse_entry_param entry = { - .ino = attr.GetId(), - .attr_timeout = Config->GetAttrTimeout().SecondsFloat(), - .entry_timeout = Config->GetEntryTimeout().SecondsFloat(), - }; - - ConvertAttr(Config->GetPreferredBlockSize(), attr, entry.attr); - if (!entry.attr.st_ino) { - const auto error = MakeError( - E_IO, - TStringBuilder() << "#" << fuse_req_unique(req) - << " listed invalid entry: parent " << ino << ", name " - << name.Quote() << ", stat " << DumpMessage(attr)); - - STORAGE_ERROR(error.GetMessage()); self->ReplyError( *callContext, - error, + response.GetError(), req, EIO); return; } - builder.Add(req, name, entry, offset); - } + TDirectoryBuilder builder(size); + if (offset == 0) { + builder.Add( + req, + ".", + {.attr = {.st_ino = MissingNodeId}}, + offset); + builder.Add( + req, + "..", + {.attr = {.st_ino = MissingNodeId}}, + offset); + } + + for (size_t i = 0; i < response.NodesSize(); ++i) { + const auto& attr = response.GetNodes(i); + const auto& name = response.GetNames(i); + + fuse_entry_param entry = { + .ino = attr.GetId(), + .attr_timeout = Config->GetAttrTimeout().SecondsFloat(), + .entry_timeout = + Config->GetEntryTimeout().SecondsFloat(), + }; + + ConvertAttr( + Config->GetPreferredBlockSize(), + attr, + entry.attr); + if (!entry.attr.st_ino) { + const auto error = MakeError( + E_IO, + TStringBuilder() << "#" << fuse_req_unique(req) + << " listed invalid entry: parent " + << ino << ", name " << name.Quote() + << ", stat " << DumpMessage(attr)); + + STORAGE_ERROR(error.GetMessage()); + self->ReplyError(*callContext, error, req, EIO); + return; + } + + builder.Add(req, name, entry, offset); + } + + auto handleChunk = handle->UpdateContent( + size, + offset, + builder.Finish(), + response.GetCookie()); - auto content = handle->UpdateContent( - size, - offset, - builder.Finish(), - response.GetCookie()); + STORAGE_TRACE( + "# " << 
fuse_req_unique(req) << " offset: " << offset + << " limit: " << size << " actual size " + << handleChunk.DirectoryContent.GetSize()); - STORAGE_TRACE("# " << fuse_req_unique(req) - << " offset: " << offset - << " limit: " << size - << " actual size " << content.GetSize()); + reply(*self, handleChunk.DirectoryContent); - reply(*self, content); - }); + if (DirectoryHandlesStorage) { + DirectoryHandlesStorage->UpdateHandle(fh, handleChunk); + } + }); } void TFileSystem::ReleaseDir( @@ -391,6 +318,10 @@ void TFileSystem::ReleaseDir( if (it != DirectoryHandles.end()) { CheckDirectoryHandle(req, ino, *it->second, Log, __func__); DirectoryHandles.erase(it); + + if (DirectoryHandlesStorage) { + DirectoryHandlesStorage->RemoveHandle(fi->fh); + } } } diff --git a/cloud/filestore/libs/vfs_fuse/fs_ut.cpp b/cloud/filestore/libs/vfs_fuse/fs_ut.cpp index 131f0739634..6e2fef24b7b 100644 --- a/cloud/filestore/libs/vfs_fuse/fs_ut.cpp +++ b/cloud/filestore/libs/vfs_fuse/fs_ut.cpp @@ -60,6 +60,7 @@ namespace { //////////////////////////////////////////////////////////////////////////////// constexpr TDuration WaitTimeout = TDuration::Seconds(5); +constexpr TDuration ExceptionWaitTimeout = TDuration::Seconds(1); static const TString FileSystemId = "fs1"; static const TString SessionId = CreateGuidAsString(); @@ -92,6 +93,8 @@ struct TBootstrap TPromise StopTriggered = NewPromise(); + TString DirectoryHandlesStoragePath; + TBootstrap( ITimerPtr timer = CreateWallClockTimer(), ISchedulerPtr scheduler = CreateScheduler(), @@ -163,6 +166,12 @@ struct TBootstrap proto.SetHandleOpsQueuePath(TempDir.Path() / "HandleOpsQueue"); proto.SetHandleOpsQueueSize(handleOpsQueueSize); } + + if (featuresConfig.GetDirectoryHandlesStorageEnabled()) { + proto.SetDirectoryHandlesStoragePath(TempDir.Path() / "DirectoryHandles"); + DirectoryHandlesStoragePath = proto.GetDirectoryHandlesStoragePath(); + } + if (featuresConfig.GetServerWriteBackCacheEnabled()) { proto.SetWriteBackCachePath(TempDir.Path() / "WriteBackCache"); // minimum possible capacity @@ -259,6 +268,12 @@ struct TBootstrap auto interrupt = Fuse->SendRequest(Fuse->GetLastRequestId() + 2); UNIT_ASSERT_NO_EXCEPTION(interrupt.GetValueSync()); }; + + static TBootstrap CreateWithHandlesStorage() { + NProto::TFileStoreFeatures features; + features.SetDirectoryHandlesStorageEnabled(true); + return TBootstrap(CreateWallClockTimer(), CreateScheduler(), features); + } }; } // namespace @@ -749,7 +764,9 @@ Y_UNIT_TEST_SUITE(TFileSystemTest) TBootstrap bootstrap; std::atomic numCalls = 0; - bootstrap.Service->ListNodesHandler = [&] (auto callContext, auto request) { + bootstrap.Service->ListNodesHandler = + [&](auto callContext, auto request) + { UNIT_ASSERT_VALUES_EQUAL(FileSystemId, callContext->FileSystemId); static ui64 id = 1; @@ -783,31 +800,313 @@ Y_UNIT_TEST_SUITE(TFileSystemTest) auto handleId = handle.GetValue(); // read dir consists of sequantial reading until empty resposne - auto read = bootstrap.Fuse->SendRequest(nodeId, handleId); + auto read = + bootstrap.Fuse->SendRequest(nodeId, handleId); UNIT_ASSERT(read.Wait(WaitTimeout)); UNIT_ASSERT_VALUES_EQUAL(numCalls.load(), 1); auto size1 = read.GetValue(); UNIT_ASSERT(size1 > 0); - read = bootstrap.Fuse->SendRequest(nodeId, handleId, size1); + read = bootstrap.Fuse->SendRequest( + nodeId, + handleId, + size1 / 2); UNIT_ASSERT(read.Wait(WaitTimeout)); - UNIT_ASSERT_VALUES_EQUAL(numCalls.load(), 2); + UNIT_ASSERT_VALUES_EQUAL(numCalls.load(), 1); auto size2 = read.GetValue(); UNIT_ASSERT(size2 > 0); - read = 
bootstrap.Fuse->SendRequest(nodeId, handleId, size1 + size2); + read = bootstrap.Fuse->SendRequest( + nodeId, + handleId, + size1); UNIT_ASSERT(read.Wait(WaitTimeout)); UNIT_ASSERT_VALUES_EQUAL(numCalls.load(), 2); auto size3 = read.GetValue(); - UNIT_ASSERT_VALUES_EQUAL(size3, 0); + UNIT_ASSERT(size3 > 0); - auto close = bootstrap.Fuse->SendRequest(nodeId, handleId); + read = bootstrap.Fuse->SendRequest( + nodeId, + handleId, + size1 + size2); + UNIT_ASSERT(read.Wait(WaitTimeout)); + UNIT_ASSERT_VALUES_EQUAL(numCalls.load(), 2); + + auto size4 = read.GetValue(); + UNIT_ASSERT_VALUES_EQUAL(size4, 0); + + auto close = + bootstrap.Fuse->SendRequest(nodeId, handleId); UNIT_ASSERT_NO_EXCEPTION(close.GetValue(WaitTimeout)); } + Y_UNIT_TEST(ShouldHandleReadDirLargeDataWithHandlesStoragePaging) + { + auto bootstrap = TBootstrap::CreateWithHandlesStorage(); + + std::atomic numCalls = 0; + bootstrap.Service->ListNodesHandler = + [&](auto callContext, auto request) + { + UNIT_ASSERT_VALUES_EQUAL(FileSystemId, callContext->FileSystemId); + + ++numCalls; + + NProto::TListNodesResponse result; + + if (!request->GetCookie()) { + for (ui32 i = 1; i <= 20000; ++i) { + result.AddNames()->assign( + "first_chunk_file_" + ToString(i) + ".txt"); + auto* node = result.AddNodes(); + node->SetId(100 + i); + node->SetType(NProto::E_REGULAR_NODE); + } + result.SetCookie("has_more_data"); + } else { + UNIT_ASSERT_VALUES_EQUAL("has_more_data", request->GetCookie()); + for (ui32 i = 20001; i <= 25100; ++i) { + result.AddNames()->assign( + "second_chunk_file_" + ToString(i) + ".txt"); + auto* node = result.AddNodes(); + node->SetId(100 + i); + node->SetType(NProto::E_REGULAR_NODE); + } + } + + return MakeFuture(result); + }; + + bootstrap.Start(); + Y_DEFER { + bootstrap.Stop(); + }; + + const ui64 nodeId = 123; + + auto handle = bootstrap.Fuse->SendRequest(nodeId); + UNIT_ASSERT(handle.Wait(WaitTimeout)); + auto handleId = handle.GetValue(); + + auto read = + bootstrap.Fuse->SendRequest(nodeId, handleId); + UNIT_ASSERT(read.Wait(WaitTimeout)); + UNIT_ASSERT_VALUES_EQUAL(numCalls.load(), 1); + + auto size1 = read.GetValue(); + UNIT_ASSERT_VALUES_EQUAL(size1, 4096); + + read = + bootstrap.Fuse->SendRequest(nodeId, handleId, 0); + UNIT_ASSERT(read.Wait(WaitTimeout)); + UNIT_ASSERT_VALUES_EQUAL(numCalls.load(), 2); + + auto size2 = read.GetValue(); + UNIT_ASSERT_VALUES_EQUAL(size2, 4096); + + auto partialOffset = size1 / 2; + read = bootstrap.Fuse->SendRequest( + nodeId, + handleId, + partialOffset); + UNIT_ASSERT(read.Wait(WaitTimeout)); + UNIT_ASSERT_VALUES_EQUAL(numCalls.load(), 2); + + auto size3 = read.GetValue(); + UNIT_ASSERT_VALUES_EQUAL(size3, 4096); + + read = bootstrap.Fuse->SendRequest( + nodeId, + handleId, + size1); + UNIT_ASSERT(read.Wait(WaitTimeout)); + UNIT_ASSERT_VALUES_EQUAL(numCalls.load(), 2); + + auto size4 = read.GetValue(); + UNIT_ASSERT_VALUES_EQUAL(size4, 4096); + + read = + bootstrap.Fuse->SendRequest(nodeId, handleId, 0); + UNIT_ASSERT(read.Wait(WaitTimeout)); + UNIT_ASSERT_VALUES_EQUAL(numCalls.load(), 3); + + auto size5 = read.GetValue(); + UNIT_ASSERT_VALUES_EQUAL(size5, 4096); + + auto largeOffset = size1 + 3700000; // Go beyond the first chunk + read = bootstrap.Fuse->SendRequest( + nodeId, + handleId, + largeOffset); + UNIT_ASSERT(read.Wait(WaitTimeout)); + UNIT_ASSERT_VALUES_EQUAL(numCalls.load(), 4); + + auto size6 = read.GetValue(); + UNIT_ASSERT_VALUES_EQUAL(size6, 4096); + + auto firstChunkOffset = size1 / 3; + read = bootstrap.Fuse->SendRequest( + nodeId, + handleId, + 
firstChunkOffset); + UNIT_ASSERT(read.Wait(WaitTimeout)); + UNIT_ASSERT_VALUES_EQUAL(numCalls.load(), 4); + + auto size7 = read.GetValue(); + UNIT_ASSERT_VALUES_EQUAL(size7, 4096); + + auto secondChunkOffset = largeOffset + 200; + read = bootstrap.Fuse->SendRequest( + nodeId, + handleId, + secondChunkOffset); + UNIT_ASSERT(read.Wait(WaitTimeout)); + UNIT_ASSERT_VALUES_EQUAL(numCalls.load(), 4); + + auto size8 = read.GetValue(); + UNIT_ASSERT_VALUES_EQUAL(size8, 4096); + + // Open second directory handle for the same nodeId + auto handle2 = bootstrap.Fuse->SendRequest(nodeId); + UNIT_ASSERT(handle2.Wait(WaitTimeout)); + auto handleId2 = handle2.GetValue(); + + read = bootstrap.Fuse->SendRequest(nodeId, handleId2); + UNIT_ASSERT(read.Wait(WaitTimeout)); + UNIT_ASSERT_VALUES_EQUAL(numCalls.load(), 5); + + read = bootstrap.Fuse->SendRequest( + nodeId, + handleId2, + size1); + UNIT_ASSERT(read.Wait(WaitTimeout)); + UNIT_ASSERT_VALUES_EQUAL(numCalls.load(), 5); + + auto size9 = read.GetValue(); + UNIT_ASSERT_VALUES_EQUAL(size9, 4096); + + auto close2 = + bootstrap.Fuse->SendRequest(nodeId, handleId2); + UNIT_ASSERT_NO_EXCEPTION(close2.GetValue(WaitTimeout)); + + auto close = + bootstrap.Fuse->SendRequest(nodeId, handleId); + UNIT_ASSERT_NO_EXCEPTION(close.GetValue(WaitTimeout)); + } + + Y_UNIT_TEST(ShouldLoadDirectoryHandlesStorageWithoutErrors) + { + const TString sessionId = CreateGuidAsString(); + NProto::TFileStoreFeatures features; + features.SetDirectoryHandlesStorageEnabled(true); + + auto createSessionHandler = [&](auto, auto) + { + NProto::TCreateSessionResponse result; + result.MutableSession()->SetSessionId(sessionId); + result.MutableFileStore()->MutableFeatures()->CopyFrom(features); + result.MutableFileStore()->SetFileSystemId(FileSystemId); + return MakeFuture(result); + }; + + std::atomic numCalls1 = 0; + std::atomic numCalls2 = 0; + + TFsPath tmpPathForCache( + TFsPath(GetSystemTempDir()) / "directory_handles_storage.dump"); + + TString pathToCache; + + { + auto bootstrap = TBootstrap::CreateWithHandlesStorage(); + + bootstrap.Service->CreateSessionHandler = createSessionHandler; + + bootstrap.Service->ListNodesHandler = + [&](auto callContext, auto request) + { + UNIT_ASSERT_VALUES_EQUAL( + FileSystemId, + callContext->FileSystemId); + + NProto::TListNodesResponse result; + + if (!request->GetCookie()) { + for (ui32 i = 1; i <= 20000; ++i) { + result.AddNames()->assign( + "first_chunk_file_" + ToString(i) + ".txt"); + auto* node = result.AddNodes(); + node->SetId(100 + i); + node->SetType(NProto::E_REGULAR_NODE); + } + ++numCalls1; + result.SetCookie("has_more_data"); + } else { + UNIT_ASSERT_VALUES_EQUAL( + "has_more_data", + request->GetCookie()); + for (ui32 i = 20001; i <= 25100; ++i) { + result.AddNames()->assign( + "second_chunk_file_" + ToString(i) + ".txt"); + auto* node = result.AddNodes(); + node->SetId(100 + i); + node->SetType(NProto::E_REGULAR_NODE); + } + ++numCalls2; + } + + return MakeFuture(result); + }; + + bootstrap.Start(); + Y_DEFER + { + bootstrap.Stop(); + }; + + pathToCache = TFsPath(bootstrap.DirectoryHandlesStoragePath) / + FileSystemId / sessionId / + "directory_handles_storage"; + + const ui64 nodeId = 123; + + auto handle = bootstrap.Fuse->SendRequest(nodeId); + UNIT_ASSERT(handle.Wait(WaitTimeout)); + auto handleId = handle.GetValue(); + + auto read = + bootstrap.Fuse->SendRequest(nodeId, handleId); + UNIT_ASSERT(read.Wait(WaitTimeout)); + UNIT_ASSERT_VALUES_EQUAL(numCalls1.load(), 1); + UNIT_ASSERT_VALUES_EQUAL(numCalls2.load(), 0); + + auto 
largeOffset = 3700000; // Go beyond the first chunk + read = bootstrap.Fuse->SendRequest( + nodeId, + handleId, + largeOffset); + UNIT_ASSERT(read.Wait(WaitTimeout)); + UNIT_ASSERT_VALUES_EQUAL(numCalls1.load(), 1); + UNIT_ASSERT_VALUES_EQUAL(numCalls2.load(), 1); + + TFsPath(pathToCache).CopyTo(tmpPathForCache.GetPath(), true); + } + + { + auto bootstrap = TBootstrap::CreateWithHandlesStorage(); + + tmpPathForCache.CopyTo(pathToCache, true); + + bootstrap.Service->CreateSessionHandler = createSessionHandler; + + bootstrap.Start(); + bootstrap.Stop(); + } + } + Y_UNIT_TEST(ShouldHandleForgetRequestsForUnknownNodes) { TBootstrap bootstrap; @@ -1848,7 +2147,9 @@ Y_UNIT_TEST_SUITE(TFileSystemTest) bootstrap.Fuse->SendRequest(nodeId2, handle2); // Second request should wait until the first request is processed. - UNIT_ASSERT_EXCEPTION(future.GetValue(WaitTimeout), yexception); + UNIT_ASSERT_EXCEPTION( + future.GetValue(ExceptionWaitTimeout), + yexception); UNIT_ASSERT_EQUAL( 1, AtomicGet(counters->GetCounter("InProgress")->GetAtomic())); diff --git a/cloud/filestore/libs/vfs_fuse/loop.cpp b/cloud/filestore/libs/vfs_fuse/loop.cpp index d96248ebd81..787331cc9b4 100644 --- a/cloud/filestore/libs/vfs_fuse/loop.cpp +++ b/cloud/filestore/libs/vfs_fuse/loop.cpp @@ -4,6 +4,7 @@ #include "fs.h" #include "fuse.h" #include "handle_ops_queue.h" +#include "directory_handles_storage.h" #include "log.h" #include @@ -54,6 +55,7 @@ namespace { static constexpr TStringBuf HandleOpsQueueFileName = "handle_ops_queue"; static constexpr TStringBuf WriteBackCacheFileName = "write_back_cache"; +static constexpr TStringBuf DirectoryHandlesStorageFileName = "directory_handles_storage"; NProto::TError CreateAndLockFile( const TString& dir, @@ -641,9 +643,11 @@ class TFileSystemLoop final THolder HandleOpsQueueFileLock; THolder WriteBackCacheFileLock; + THolder DirectoryHandlesStorageFileLock; bool HandleOpsQueueInitialized = false; bool WriteBackCacheInitialized = false; + bool DirectoryHandlesStorageInitialized = false; public: TFileSystemLoop( @@ -775,6 +779,18 @@ class TFileSystemLoop final } } + if (p->DirectoryHandlesStorageInitialized) { + auto error = UnlockAndDeleteFile( + TFsPath( + p->Config->GetDirectoryHandlesStoragePath()) / + p->Config->GetFileSystemId() / p->SessionId, + p->DirectoryHandlesStorageFileLock); + if (HasError(error)) { + ReportDirectoryHandlesStorageError( + error.GetMessage()); + } + } + s.SetValue(); }); }; @@ -1019,6 +1035,33 @@ class TFileSystemLoop final } } + TDirectoryHandlesStoragePtr directoryHandlesStorage; + if (FileSystemConfig->GetDirectoryHandlesStorageEnabled() && + Config->GetDirectoryHandlesStoragePath()) + { + auto path = TFsPath(Config->GetDirectoryHandlesStoragePath()) / + FileSystemConfig->GetFileSystemId() / SessionId; + + auto error = CreateAndLockFile( + path, + DirectoryHandlesStorageFileName, + DirectoryHandlesStorageFileLock); + + if (HasError(error)) { + ReportDirectoryHandlesStorageError(error.GetMessage()); + return error; + } + + directoryHandlesStorage = CreateDirectoryHandlesStorage( + Log, + path / DirectoryHandlesStorageFileName, + FileSystemConfig->GetDirectoryHandlesTableSize(), + Config->GetDirectoryHandlesInitialDataSize(), + FileSystemConfig->GetMaxBufferSize()); + + DirectoryHandlesStorageInitialized = true; + } + FileSystem = CreateFileSystem( Logging, ProfileLog, @@ -1029,6 +1072,7 @@ class TFileSystemLoop final RequestStats, CompletionQueue, std::move(handleOpsQueue), + std::move(directoryHandlesStorage), std::move(writeBackCache)); 
RequestStats->RegisterIncompleteRequestProvider(CompletionQueue); @@ -1124,6 +1168,12 @@ class TFileSystemLoop final config.SetServerWriteBackCacheEnabled( features.GetServerWriteBackCacheEnabled()); + config.SetDirectoryHandlesStorageEnabled( + features.GetDirectoryHandlesStorageEnabled()); + + config.SetDirectoryHandlesTableSize( + features.GetDirectoryHandlesTableSize()); + config.SetZeroCopyEnabled(features.GetZeroCopyEnabled()); config.SetGuestPageCacheDisabled(features.GetGuestPageCacheDisabled()); diff --git a/cloud/filestore/libs/vfs_fuse/public.h b/cloud/filestore/libs/vfs_fuse/public.h index 5501634621c..e817b7621ea 100644 --- a/cloud/filestore/libs/vfs_fuse/public.h +++ b/cloud/filestore/libs/vfs_fuse/public.h @@ -23,4 +23,7 @@ using ICompletionQueuePtr = std::shared_ptr; class THandleOpsQueue; using THandleOpsQueuePtr = std::unique_ptr; +class TDirectoryHandlesStorage; +using TDirectoryHandlesStoragePtr = std::unique_ptr; + } // namespace NCloud::NFileStore::NFuse diff --git a/cloud/filestore/libs/vfs_fuse/ya.make.inc b/cloud/filestore/libs/vfs_fuse/ya.make.inc index a1e5ff016a4..c5f4de5e7b6 100644 --- a/cloud/filestore/libs/vfs_fuse/ya.make.inc +++ b/cloud/filestore/libs/vfs_fuse/ya.make.inc @@ -1,6 +1,8 @@ SRCS( config.cpp + directory_handles_storage.cpp fs.cpp + fs_directory_handle.cpp fs_impl.cpp fs_impl_attr.cpp fs_impl_data.cpp diff --git a/cloud/filestore/public/api/protos/fs.proto b/cloud/filestore/public/api/protos/fs.proto index 95aad346fe4..d3d34768109 100644 --- a/cloud/filestore/public/api/protos/fs.proto +++ b/cloud/filestore/public/api/protos/fs.proto @@ -40,6 +40,8 @@ message TFileStoreFeatures uint32 MaxFuseLoopThreads = 25; bool ZeroCopyWriteEnabled = 26; bool FSyncQueueDisabled = 27; + bool DirectoryHandlesStorageEnabled = 28; + uint64 DirectoryHandlesTableSize = 29; } message TFileStore diff --git a/cloud/filestore/tests/common_configs/nfs-storage-newfeatures-patch.txt b/cloud/filestore/tests/common_configs/nfs-storage-newfeatures-patch.txt index ed02a899101..dcd2c3b7c0b 100644 --- a/cloud/filestore/tests/common_configs/nfs-storage-newfeatures-patch.txt +++ b/cloud/filestore/tests/common_configs/nfs-storage-newfeatures-patch.txt @@ -28,3 +28,5 @@ ZeroCopyWriteEnabled: true GuestKeepCacheAllowed: true SessionHandleOffloadedStatsCapacity: 100 GuestCachingType: GCT_ANY_READ +DirectoryHandlesStorageEnabled: true +DirectoryHandlesTableSize: 1200 diff --git a/cloud/filestore/tests/permanent_dirs/canondata/result.json b/cloud/filestore/tests/permanent_dirs/canondata/result.json new file mode 100644 index 00000000000..451190779e7 --- /dev/null +++ b/cloud/filestore/tests/permanent_dirs/canondata/result.json @@ -0,0 +1,5 @@ +{ + "test.test": { + "uri": "file://test.test/results.txt" + } +} diff --git a/cloud/filestore/tests/permanent_dirs/canondata/test.test/results.txt b/cloud/filestore/tests/permanent_dirs/canondata/test.test/results.txt new file mode 100644 index 00000000000..7326d960397 --- /dev/null +++ b/cloud/filestore/tests/permanent_dirs/canondata/test.test/results.txt @@ -0,0 +1 @@ +Ok diff --git a/cloud/filestore/tests/permanent_dirs/test.py b/cloud/filestore/tests/permanent_dirs/test.py new file mode 100644 index 00000000000..a57b088ec6d --- /dev/null +++ b/cloud/filestore/tests/permanent_dirs/test.py @@ -0,0 +1,105 @@ +import logging +import os +import time +from time import sleep + +import retrying +from retrying import retry + +import yatest.common as common + +import cloud.filestore.public.sdk.python.protos as protos + +from 
cloud.filestore.public.sdk.python.client.grpc_client import CreateGrpcEndpointClient, CreateGrpcClient +from cloud.filestore.tests.python.lib.common import is_grpc_error + +from cloud.storage.core.tools.testing.qemu.lib.common import ( + env_with_guest_index, + SshToGuest, +) + +RETRY_COUNT = 3 +WAIT_TIMEOUT_MS = 1000 +MAX_DIRS = 25000 +WAIT_FILE_TIMEOUT_MS = 15000 + + +@retrying.retry(stop_max_delay=60000, wait_fixed=1000, retry_on_exception=is_grpc_error) +def wait_for_filestore_vhost(port, port_type="endpoint"): + if port_type == "endpoint": + with CreateGrpcEndpointClient(str("localhost:%d" % port)) as grpc_client: + grpc_client.ping(protos.TPingRequest()) + elif port_type == "filestore": + with CreateGrpcClient(str("localhost:%d" % port)) as grpc_client: + grpc_client.ping(protos.TPingRequest()) + else: + raise Exception(f"Invalid port type {port_type}") + + +@retry(stop_max_attempt_number=RETRY_COUNT, wait_fixed=WAIT_TIMEOUT_MS) +def create_dirs_bulk(ssh: SshToGuest, parent_dir: str): + command = f"for i in {{1..{MAX_DIRS}}}; do sudo mkdir -p {parent_dir}/dirname$(printf \"%03d\" $i); done" + return ssh(command) + + +@retry(stop_max_attempt_number=RETRY_COUNT, wait_fixed=WAIT_TIMEOUT_MS) +def create_dir(ssh: SshToGuest, dir: str, dir_name: str): + return ssh(f"sudo mkdir -p {dir}/{dir_name}") + + +@retry(stop_max_attempt_number=RETRY_COUNT, wait_fixed=WAIT_TIMEOUT_MS) +def create_file(ssh: SshToGuest, dir: str, file_name: str): + return ssh(f"sudo touch {dir}/{file_name}") + + +def test(): + logger = logging.getLogger("test") + vhost_port = os.getenv("NFS_VHOST_PORT") + + port = int(os.getenv(env_with_guest_index("QEMU_FORWARDING_PORT", 0))) + ssh_key = os.getenv("QEMU_SSH_KEY") + mount_dir = os.getenv("NFS_MOUNT_PATH") + + ssh = SshToGuest(user="qemu", port=port, key=ssh_key) + + readdir_bin_path = common.binary_path("cloud/filestore/tools/testing/directory_handles_state_test/directory_handles_state_test") + + dir_for_open_restart = "test_dir" + file_for_script_continue = "test_file_continue" + + full_dir_for_open_restart = f"{mount_dir}/{dir_for_open_restart}" + full_file_for_script_continue = f"{mount_dir}/{file_for_script_continue}" + + res1 = create_dir(ssh, mount_dir, dir_for_open_restart) + res1 = create_dirs_bulk(ssh, full_dir_for_open_restart) + assert 0 == res1.returncode + + ssh( + f"{readdir_bin_path} {full_dir_for_open_restart} {full_file_for_script_continue} {WAIT_FILE_TIMEOUT_MS} {MAX_DIRS} > /tmp/dir_for_open_restart.log 2>&1 &") + + restart_flag_on_demand = os.getenv("VHOST_RESTART_FLAG_ON_DEMAND") + + if restart_flag_on_demand is None: + exit(1) + + logger.info(f"creating flag file time: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}.{int(time.time() * 1000) % 1000:03d}") + if restart_flag_on_demand: + ssh(f"sudo touch {restart_flag_on_demand}") + sleep(int(os.getenv("VHOST_RESTART_INTERVAL")) * 0.5) + ssh(f"sudo rm -f {restart_flag_on_demand}") + + logger.info(f"wait_for_filestore_vhost time: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}.{int(time.time() * 1000) % 1000:03d}") + wait_for_filestore_vhost(int(vhost_port)) + logger.info(f"wait_for_filestore_vhost finished time: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}.{int(time.time() * 1000) % 1000:03d}") + + res1 = create_file(ssh, mount_dir, file_for_script_continue) + assert 0 == res1.returncode + + ret = ssh("sudo cat /tmp/dir_for_open_restart.log") + + results_path = common.output_path() + "/results.txt" + with open(results_path, 'w') as results: + 
results.write(ret.stdout.decode('utf8')) + + ret = common.canonical_file(results_path, local=True) + return ret diff --git a/cloud/filestore/tests/permanent_dirs/ya.make b/cloud/filestore/tests/permanent_dirs/ya.make new file mode 100644 index 00000000000..e06ad963b7a --- /dev/null +++ b/cloud/filestore/tests/permanent_dirs/ya.make @@ -0,0 +1,41 @@ +PY3TEST() + +INCLUDE(${ARCADIA_ROOT}/cloud/filestore/tests/recipes/large.inc) +SPLIT_FACTOR(1) + +TEST_SRCS( + test.py +) + +DEPENDS( + cloud/filestore/tools/testing/directory_handles_state_test +) + +PEERDIR( + cloud/filestore/public/sdk/python/client + cloud/filestore/tests/python/lib + + cloud/storage/core/tools/testing/qemu/lib +) + +SET( + NFS_STORAGE_CONFIG_PATCH + cloud/filestore/tests/common_configs/nfs-storage-newfeatures-patch.txt +) + +SET(QEMU_VIRTIO fs) +SET(QEMU_INSTANCE_COUNT 1) +SET(FILESTORE_VHOST_ENDPOINT_COUNT 1) +SET(VIRTIOFS_SERVER_COUNT 1) +SET(QEMU_INVOKE_TEST NO) + +SET(VHOST_RESTART_INTERVAL 10) +SET(VHOST_RESTART_FLAG 1) +SET(VHOST_RESTART_FLAG_ON_DEMAND YES) + +INCLUDE(${ARCADIA_ROOT}/cloud/filestore/tests/recipes/service-kikimr.inc) +INCLUDE(${ARCADIA_ROOT}/cloud/filestore/tests/recipes/vhost-kikimr.inc) +INCLUDE(${ARCADIA_ROOT}/cloud/filestore/tests/recipes/vhost-endpoint.inc) +INCLUDE(${ARCADIA_ROOT}/cloud/storage/core/tests/recipes/qemu.inc) + +END() diff --git a/cloud/filestore/tests/recipes/vhost-kikimr.inc b/cloud/filestore/tests/recipes/vhost-kikimr.inc index 594b514a066..ffe1cb06df0 100644 --- a/cloud/filestore/tests/recipes/vhost-kikimr.inc +++ b/cloud/filestore/tests/recipes/vhost-kikimr.inc @@ -15,6 +15,10 @@ SET(RECIPE_ARGS --restart-flag $VHOST_RESTART_FLAG ) +IF (VHOST_RESTART_FLAG_ON_DEMAND) + SET_APPEND(RECIPE_ARGS --restart-flag-on-demand) +ENDIF() + IF (NFS_STORAGE_CONFIG_PATCH) SET_APPEND(RECIPE_ARGS --storage-config-patch $NFS_STORAGE_CONFIG_PATCH) ENDIF() diff --git a/cloud/filestore/tests/recipes/vhost/__main__.py b/cloud/filestore/tests/recipes/vhost/__main__.py index 95ac8d5baa8..96589caa40c 100644 --- a/cloud/filestore/tests/recipes/vhost/__main__.py +++ b/cloud/filestore/tests/recipes/vhost/__main__.py @@ -41,6 +41,7 @@ def start(argv): parser.add_argument("--service", action="store", default=None) parser.add_argument("--restart-interval", action="store", default=None) parser.add_argument("--restart-flag", action="store", default=None) + parser.add_argument("--restart-flag-on-demand", action="store_true", default=False) parser.add_argument("--storage-config-patch", action="store", default=None) parser.add_argument("--direct-io", action="store_true", default=False) parser.add_argument("--use-unix-socket", action="store_true", default=False) @@ -79,6 +80,11 @@ def start(argv): pathlib.Path(write_back_cache_path).mkdir(parents=True, exist_ok=True) config.VhostServiceConfig.WriteBackCachePath = write_back_cache_path + directory_handles_storage_path = common.work_path() + "/directoryhandlesstorage-" + uid + pathlib.Path(directory_handles_storage_path).mkdir(parents=True, exist_ok=True) + config.VhostServiceConfig.DirectoryHandlesStoragePath = directory_handles_storage_path + config.VhostServiceConfig.DirectoryHandlesInitialDataSize = 1000100 + service_type = args.service or "local" kikimr_port = 0 domain = None @@ -163,7 +169,10 @@ def start(argv): wait_for_filestore_vhost(filestore_vhost, vhost_configurator.port) if restart_interval: - if restart_flag is not None: + set_env("VHOST_RESTART_INTERVAL", str(restart_interval)) + if args.restart_flag_on_demand: + 
set_env("VHOST_RESTART_FLAG_ON_DEMAND", restart_flag) + elif restart_flag is not None: set_env("QEMU_SET_READY_FLAG", restart_flag) set_env("NFS_VHOST_PORT", str(vhost_configurator.port)) diff --git a/cloud/filestore/tools/testing/directory_handles_state_test/main.cpp b/cloud/filestore/tools/testing/directory_handles_state_test/main.cpp new file mode 100644 index 00000000000..12747ed6d51 --- /dev/null +++ b/cloud/filestore/tools/testing/directory_handles_state_test/main.cpp @@ -0,0 +1,278 @@ +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace { + +//////////////////////////////////////////////////////////////////////////////// + +constexpr int NumTestPositions = 200; +constexpr int MinDirectories = 10000; +constexpr size_t MaxFilenameLen = 256; +constexpr int SleepIntervalMs = 100; +constexpr int MsToUs = 1000; +constexpr int NumParallelTests = 10; + +//////////////////////////////////////////////////////////////////////////////// + +struct TDirEntry +{ + long Position; + char Name[MaxFilenameLen]; +}; + +struct TTestContext +{ + std::string Name; + std::string DirectoryPath; + std::string WaitFilePath; + int TimeoutMs; + int NumDirectories; + + DIR* DirHandle = nullptr; + std::vector Entries; + std::vector TestPositions; + bool Success = true; + + ~TTestContext() + { + if (DirHandle) { + closedir(DirHandle); + } + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +std::mutex g_OutputMutex; + +//////////////////////////////////////////////////////////////////////////////// + +bool PrepareDirectoryData(TTestContext& ctx) +{ + // Open directory + ctx.DirHandle = opendir(ctx.DirectoryPath.c_str()); + if (!ctx.DirHandle) { + std::lock_guard lock(g_OutputMutex); + std::cerr << "[" << ctx.Name + << "] Failed to open directory: " << ctx.DirectoryPath + << std::endl; + return false; + } + + // Read and cache directory entries + ctx.Entries.reserve(ctx.NumDirectories); + struct dirent* entry; + int entryCount = 0; + + while (entryCount < ctx.NumDirectories) { + long pos = telldir(ctx.DirHandle); + entry = readdir(ctx.DirHandle); + + if (!entry) { + break; + } + + if (entry->d_name[0] != '.') { + TDirEntry dirEntry; + dirEntry.Position = pos; + std::strncpy(dirEntry.Name, entry->d_name, MaxFilenameLen - 1); + dirEntry.Name[MaxFilenameLen - 1] = '\0'; + + ctx.Entries.push_back(dirEntry); + ++entryCount; + } + } + + if (ctx.Entries.empty()) { + std::lock_guard lock(g_OutputMutex); + std::cerr << "[" << ctx.Name << "] No directory entries found" + << std::endl; + return false; + } + + // Generate random test positions using thread-safe random generator + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution<> dis(0, ctx.Entries.size() - 1); + + ctx.TestPositions.resize(NumTestPositions); + for (int i = 0; i < NumTestPositions; ++i) { + ctx.TestPositions[i] = dis(gen); + } + + return true; +} + +bool TestDirectory(TTestContext& ctx) +{ + bool hasErrors = false; + int timeoutRemaining = ctx.TimeoutMs; + + for (int i = 0; i < NumTestPositions; ++i) { + int targetIdx = ctx.TestPositions[i]; + if (targetIdx >= static_cast(ctx.Entries.size())) { + continue; + } + + // Seek to test position + long seekPos = ctx.Entries[targetIdx].Position; + seekdir(ctx.DirHandle, seekPos); + + // Wait for restart signal at second iteration + if (i == 1) { + struct stat waitSt; + while (stat(ctx.WaitFilePath.c_str(), &waitSt) == -1 && + timeoutRemaining > 0) + { + usleep(SleepIntervalMs 
* MsToUs); + timeoutRemaining -= SleepIntervalMs; + } + + if (timeoutRemaining <= 0) { + std::lock_guard lock(g_OutputMutex); + std::cerr << "[" << ctx.Name + << "] Error: timeout waiting for file " + << ctx.WaitFilePath << std::endl; + return false; + } + } + + // Read and verify entry + struct dirent* entry = readdir(ctx.DirHandle); + if (!entry) { + std::lock_guard lock(g_OutputMutex); + std::cerr << "[" << ctx.Name + << "] Error: readdir returned NULL at iteration " << i + << ", position " << targetIdx << ", seekPos " << seekPos + << std::endl; + hasErrors = true; + continue; + } + + if (std::strcmp(entry->d_name, ctx.Entries[targetIdx].Name) != 0) { + std::lock_guard lock(g_OutputMutex); + std::cerr << "[" << ctx.Name << "] Error: mismatch at iteration " + << i << ", position " << targetIdx << ", seekPos " + << seekPos << " - expected '" + << ctx.Entries[targetIdx].Name << "', got '" + << entry->d_name << "'" << std::endl; + hasErrors = true; + } + } + + return !hasErrors; +} + +} // namespace + +//////////////////////////////////////////////////////////////////////////////// + +// this tool is used to confirm that we can use directory listing consistently +// despite vhost restart. the flow as follows: +// 1. create a directory with a lot of files +// 2. open the directory and read the files +// 3. restart the vhost +// 4. open the directory and read the files +// 5. check that the files are the same +// 6. repeat the process 10 times concurrently (this way internal cache data is +// interleaved) +int main(int argc, char** argv) +{ + if (argc != 5) { + std::cerr << "Usage: " << argv[0] + << " " + << std::endl; + return 1; + } + + std::string directoryPath = argv[1]; + std::string waitFilePath = argv[2]; + int timeoutMs = std::atoi(argv[3]); + int numDirectories = std::atoi(argv[4]); + + if (numDirectories < MinDirectories) { + std::cerr << "Error: Number of directories (" << numDirectories + << ") must be at least " << MinDirectories << std::endl; + return 1; + } + + if (timeoutMs <= 0) { + std::cerr << "Error: Timeout must be positive" << std::endl; + return 1; + } + + // Create test contexts for the same directory (parallel testing) + std::vector contexts; + + for (int i = 1; i <= NumParallelTests; ++i) { + TTestContext ctx; + ctx.Name = "dir" + std::to_string(i); + ctx.DirectoryPath = directoryPath; + ctx.WaitFilePath = waitFilePath; + ctx.TimeoutMs = timeoutMs; + ctx.NumDirectories = numDirectories; + contexts.push_back(std::move(ctx)); + } + + // Phase 1: Prepare directory data in parallel + { + std::vector threads; + for (auto& ctx: contexts) { + threads.emplace_back( + [&ctx]() + { + if (!PrepareDirectoryData(ctx)) { + ctx.Success = false; + } + }); + } + + for (auto& thread: threads) { + thread.join(); + } + + // Check if all preparations succeeded + for (const auto& ctx: contexts) { + if (!ctx.Success) { + std::cerr << "[" << ctx.Name + << "] Failed to prepare directory data" << std::endl; + return 1; + } + } + } + + // Phase 2: Test directories in parallel + { + std::vector threads; + for (auto& ctx: contexts) { + threads.emplace_back([&ctx]() + { ctx.Success = TestDirectory(ctx); }); + } + + for (auto& thread: threads) { + thread.join(); + } + } + + // Check results + for (const auto& ctx: contexts) { + if (!ctx.Success) { + std::cout << "errors detected" << std::endl; + return 1; + } + } + + std::cout << "Ok" << std::endl; + + return 0; +} diff --git a/cloud/filestore/tools/testing/directory_handles_state_test/ya.make 
b/cloud/filestore/tools/testing/directory_handles_state_test/ya.make new file mode 100644 index 00000000000..422edcb8cf7 --- /dev/null +++ b/cloud/filestore/tools/testing/directory_handles_state_test/ya.make @@ -0,0 +1,7 @@ +PROGRAM(directory_handles_state_test) + +SRCS( + main.cpp +) + +END() diff --git a/cloud/filestore/tools/testing/ya.make b/cloud/filestore/tools/testing/ya.make index e466f3159ad..39904898879 100644 --- a/cloud/filestore/tools/testing/ya.make +++ b/cloud/filestore/tools/testing/ya.make @@ -2,5 +2,6 @@ RECURSE( fs_posix_compliance loadtest open_close_bench + directory_handles_state_test profile_log )
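For reference, the property that directory_handles_state_test exercises can be condensed to a few lines: a telldir() cookie obtained before the vhost restart must still be usable with seekdir() afterwards, with readdir() returning the same entry, while the DIR* stays open the whole time. The sketch below is an illustration of that check only (single pass, no parallel contexts, no restart-wait logic) and is not a substitute for main.cpp above.

// Condensed illustration of the directory_handles_state_test check:
// remember telldir() cookies, keep the DIR* open across the server restart,
// then seekdir() back and verify readdir() still yields the same names.
#include <dirent.h>

#include <iostream>
#include <string>
#include <utility>
#include <vector>

int main(int argc, char** argv)
{
    if (argc != 2) {
        std::cerr << "Usage: " << argv[0] << " <directory_path>" << std::endl;
        return 1;
    }

    DIR* dir = opendir(argv[1]);
    if (!dir) {
        return 1;
    }

    // Phase 1: cache (cookie, name) pairs for every entry.
    std::vector<std::pair<long, std::string>> entries;
    for (;;) {
        long cookie = telldir(dir);
        dirent* e = readdir(dir);
        if (!e) {
            break;
        }
        if (e->d_name[0] != '.') {
            entries.emplace_back(cookie, e->d_name);
        }
    }

    // ... the real tool waits here until the vhost has been restarted ...

    // Phase 2: every cached cookie must still resolve to the same entry.
    bool ok = true;
    for (const auto& [cookie, name]: entries) {
        seekdir(dir, cookie);
        dirent* e = readdir(dir);
        ok = ok && e && name == e->d_name;
    }

    closedir(dir);
    std::cout << (ok ? "Ok" : "errors detected") << std::endl;
    return ok ? 0 : 1;
}

The per-session DirectoryHandles table introduced in this change is what is meant to keep the listing state behind those cookies valid across the restart.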