Skip to content

Commit cdf543c

Browse files
committed
Faster search by file contents using file memory mapping
1 parent 366d1d1 commit cdf543c

File tree

4 files changed

+108
-39
lines changed

4 files changed

+108
-39
lines changed

file-commander-core/src/cfilemanipulator.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,7 @@ FileOperationResultCode CFileManipulator::copyChunk(const uint64_t chunkSize, co
159159

160160
_destinationFilePath = destFolder + (newName.isEmpty() ? _srcObject.fullName() : newName);
161161

162-
if (!_destFile.open(_destinationFilePath.toUtf8().constData(), thin_io::file_definitions::Write)) [[unlikely]]
162+
if (!_destFile.open(_destinationFilePath.toUtf8().constData(), thin_io::file::open_mode::Write)) [[unlikely]]
163163
{
164164
_lastErrorMessage = getLastFileError();
165165

file-commander-core/src/filesearchengine/cfilesearchengine.cpp

Lines changed: 105 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -3,17 +3,19 @@
33
#include "directoryscanner.h"
44

55
#include "qtcore_helpers/qstring_helpers.hpp"
6+
#include "file.hpp"
67

78
#include "hash/jenkins_hash.hpp"
89
#include "threading/thread_helpers.h"
10+
#include "utility_functions/memory_functions.h"
911

1012
DISABLE_COMPILER_WARNINGS
1113
#include <QDebug>
1214
#include <QRegularExpression>
1315
#include <QTextStream>
1416
RESTORE_COMPILER_WARNINGS
1517

16-
static QString queryToRegex(const QString& query)
18+
[[nodiscard]] static QString queryToRegex(const QString& query, bool startToEnd)
1719
{
1820
// Escape the dots
1921
QString regExString = QString{ query }.replace('.', QLatin1StringView{ "\\." });
@@ -22,18 +24,102 @@ static QString queryToRegex(const QString& query)
2224
if (nameQueryHasWildcards)
2325
{
2426
regExString.replace('?', '.').replace('*', ".*");
25-
regExString.prepend("\\A").append("\\z");
27+
//regExString.prepend("\\A").append("\\z");
2628
}
2729

28-
if (!regExString.startsWith('^'))
29-
regExString.prepend('^');
30+
if (startToEnd)
31+
{
32+
if (!regExString.startsWith('^'))
33+
regExString.prepend('^');
3034

31-
if (!regExString.endsWith('$'))
32-
regExString.append('$');
35+
if (!regExString.endsWith('$'))
36+
regExString.append('$');
37+
}
3338

3439
return regExString;
3540
}
3641

42+
// Scalar NUL -> ' ' replacement over exactly `size` bytes.
// Used as the portable fallback and for the tail that does not fill a whole SIMD vector.
inline void replace_byte_scalar(uint8_t* array, size_t size) noexcept
{
	for (size_t i = 0; i < size; ++i)
	{
		if (array[i] == 0)
			array[i] = ' ';
	}
}

#if defined(__ARM_ARCH_ISA_A64) || defined(__aarch64__) || defined(_M_ARM64)

#include <arm_neon.h>

// Replaces every NUL byte in [array, array + size) with a space, in place.
// `size` is a byte count and may be any value, including 0 and non-multiples of 16:
// whole 16-byte vectors are processed with NEON, the remainder with the scalar loop,
// so no byte at or beyond array + size is ever read or written.
inline void replace_byte(uint8_t* array, size_t size) noexcept
{
	const uint8x16_t old_neon = vdupq_n_u8(0);   // The value to look for, broadcast to all 16 lanes
	const uint8x16_t new_neon = vdupq_n_u8(' '); // The replacement value, broadcast to all 16 lanes

	size_t i = 0;
	for (; size - i >= 16; i += 16) // i <= size always holds, so the subtraction cannot wrap
	{
		const uint8x16_t data = vld1q_u8(array + i);                // Load 16 bytes
		const uint8x16_t mask = vceqq_u8(data, old_neon);           // 0xFF in every lane that holds a NUL
		const uint8x16_t result = vbslq_u8(mask, new_neon, data);   // Take ' ' where the mask is set, else the original byte
		vst1q_u8(array + i, result);                                // Store the result back
	}

	replace_byte_scalar(array + i, size - i);
}

#elif defined(__SSE2__) || defined(_M_X64) || defined(_M_AMD64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2)

#include <emmintrin.h> // SSE2

// Replaces every NUL byte in [array, array + size) with a space, in place.
// `size` is a byte count and may be any value, including 0 and non-multiples of 16:
// whole 16-byte vectors are processed with SSE2, the remainder with the scalar loop,
// so no byte at or beyond array + size is ever read or written.
inline void replace_byte(uint8_t* array, size_t size) noexcept
{
	const __m128i old_sse = _mm_setzero_si128();  // The value to look for (0) in all 16 lanes
	const __m128i new_sse = _mm_set1_epi8(' ');   // The replacement value in all 16 lanes

	size_t i = 0;
	for (; size - i >= 16; i += 16) // i <= size always holds, so the subtraction cannot wrap
	{
		const __m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i*>(array + i)); // Load 16 bytes (unaligned)
		const __m128i mask = _mm_cmpeq_epi8(data, old_sse);                                // 0xFF in every lane that holds a NUL
		// The matching lanes are all-zero, so OR-ing in (mask & ' ') is equivalent to a blend
		// and only needs SSE2 instead of SSE4.1.
		const __m128i result = _mm_or_si128(data, _mm_and_si128(mask, new_sse));
		_mm_storeu_si128(reinterpret_cast<__m128i*>(array + i), result);                   // Store the result back
	}

	replace_byte_scalar(array + i, size - i);
}

#else // No known SIMD instruction set: plain scalar implementation

// Replaces every NUL byte in [array, array + size) with a space, in place.
inline void replace_byte(uint8_t* array, size_t size) noexcept
{
	replace_byte_scalar(array, size);
}

#endif
79+
80+
[[nodiscard]] static bool fileContentsMatches(const QString& path, const QRegularExpression& regex)
81+
{
82+
thin_io::file file;
83+
if (!file.open(path.toUtf8().constData(), thin_io::file::open_mode::Read)) [[unlikely]]
84+
return false;
85+
86+
const auto fileSize = file.size().value_or(0);
87+
if (fileSize == 0) [[unlikely]]
88+
return false;
89+
90+
static constexpr auto toBytePtr = [](const void* ptr) -> const std::byte* {
91+
return reinterpret_cast<const std::byte*>(ptr);
92+
};
93+
94+
auto* mappedFile = toBytePtr(file.mmap(thin_io::file::mmap_access_mode::ReadOnly, 0, fileSize));
95+
if (!mappedFile) [[unlikely]]
96+
{
97+
assert_debug_only(mappedFile);
98+
return false;
99+
}
100+
101+
102+
static constexpr size_t maxLineLength = 8 * 1024;
103+
char buffer[maxLineLength];
104+
105+
for (size_t offset = 0; offset < fileSize; )
106+
{
107+
const auto maxSearchLength = std::min(fileSize - offset, maxLineLength);
108+
const auto lineStart = mappedFile + offset;
109+
offset += maxSearchLength;
110+
111+
::memcpy(buffer, lineStart, maxSearchLength);
112+
replace_byte((uint8_t*)buffer, (maxSearchLength + 15) / 16);
113+
114+
QString line = QString::fromUtf8(buffer, maxSearchLength);
115+
assert(!line.isEmpty());
116+
if (regex.match(line).hasMatch())
117+
return true;
118+
}
119+
120+
return false;
121+
}
122+
37123
CFileSearchEngine::CFileSearchEngine(CController& controller) :
38124
_controller(controller),
39125
_workerThread("File search thread")
@@ -92,7 +178,7 @@ void CFileSearchEngine::searchThread(const QString& what, bool subjectCaseSensit
92178
QRegularExpression queryRegExp;
93179
if (!noFileNameFilter)
94180
{
95-
queryRegExp.setPattern(queryToRegex(what));
181+
queryRegExp.setPattern(queryToRegex(what, true));
96182
assert_r(queryRegExp.isValid());
97183

98184
if (!subjectCaseSensitive)
@@ -104,34 +190,29 @@ void CFileSearchEngine::searchThread(const QString& what, bool subjectCaseSensit
104190
QRegularExpression fileContentsRegExp;
105191
if (searchByContents)
106192
{
107-
if (contentsToFind.contains(QRegularExpression(QSL("[*?]"))))
108-
{
109-
fileContentsRegExp.setPattern(QRegularExpression::wildcardToRegularExpression(contentsToFind));
110-
assert_r(fileContentsRegExp.isValid());
111-
}
112-
else if (contentsWholeWords)
113-
{
114-
fileContentsRegExp.setPattern("\\b" + contentsToFind + "\\b");
115-
assert_r(fileContentsRegExp.isValid());
116-
}
193+
QString pattern = queryToRegex(contentsToFind, false);
194+
if (contentsWholeWords)
195+
pattern.prepend("\\b").append("\\b");
196+
197+
fileContentsRegExp.setPattern(pattern);
117198

118199
if (!contentsCaseSensitive)
119200
fileContentsRegExp.setPatternOptions(QRegularExpression::CaseInsensitiveOption);
120-
}
121201

122-
const bool useFileContentsRegExp = !fileContentsRegExp.pattern().isEmpty();
202+
assert_r(fileContentsRegExp.isValid());
203+
}
123204

124205
const int uniqueJobTag = static_cast<int>(jenkins_hash("CFileSearchEngine")) + rand();
125206

126207
QString line;
127208

209+
const QByteArray contentsUtf8 = contentsToFind.toUtf8();
210+
128211
for (const QString& pathToLookIn : where)
129212
{
130213
scanDirectory(CFileSystemObject(pathToLookIn),
131214
[&](const CFileSystemObject& item) {
132215

133-
++itemCounter;
134-
135216
if (itemCounter % 8192 == 0)
136217
{
137218
// No need to report every single item and waste CPU cycles
@@ -141,6 +222,8 @@ void CFileSearchEngine::searchThread(const QString& what, bool subjectCaseSensit
141222
}, uniqueJobTag);
142223
}
143224

225+
++itemCounter;
226+
144227
if (searchByContents && !item.isFile())
145228
return;
146229

@@ -152,21 +235,7 @@ void CFileSearchEngine::searchThread(const QString& what, bool subjectCaseSensit
152235
bool matchFound = false;
153236

154237
if (searchByContents)
155-
{
156-
QFile file{ item.fullAbsolutePath() };
157-
if (!file.open(QFile::ReadOnly))
158-
return;
159-
160-
const auto contentsCaseSensitivity = contentsCaseSensitive ? Qt::CaseSensitive : Qt::CaseInsensitive;
161-
162-
QTextStream stream{ &file };
163-
164-
while (!matchFound && !_workerThread.terminationFlag() && stream.readLineInto(&line))
165-
{
166-
// contains() is faster than RegEx match (as of Qt 5.4.2, but this was for QRegExp, not tested with QRegularExpression)
167-
matchFound = useFileContentsRegExp ? fileContentsRegExp.match(line).hasMatch() : line.contains(contentsToFind, contentsCaseSensitivity);
168-
}
169-
}
238+
matchFound = fileContentsMatches(item.fullAbsolutePath(), fileContentsRegExp);
170239
else
171240
matchFound = nameMatches;
172241

0 commit comments

Comments
 (0)