Skip to content

Commit b36c16b

Browse files
committed
Implement single shard use-case for rpoplpush. Some BLPOP related refactoring
1 parent d3764ef commit b36c16b

File tree

7 files changed

+167
-130
lines changed

7 files changed

+167
-130
lines changed

TODO.md

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,18 @@
1-
1. To move lua_project to dragonfly from helio
1+
1. To move lua_project to dragonfly from helio (DONE)
22
2. To limit lua stack to something reasonable like 4096.
3-
3. To inject our own allocator to lua to track its memory.
3+
3. To inject our own allocator to lua to track its memory.
4+
5+
6+
## Object lifecycle and thread-safety.
7+
8+
Currently our transactional and locking model is based on an assumption that any READ or WRITE
9+
access to objects must be performed in a shard where they belong.
10+
11+
However, this assumption can be relaxed to get significant gains for read-only queries.
12+
13+
### Explanation
14+
Our transactional framework prevents from READ-locked objects to be mutated. It does not prevent from their PrimaryTable to grow or change, of course. These objects can move to different entries inside the table. However, our CompactObject maintains the following property - its reference CompactObject.AsRef() is valid no matter where the master object moves and it's valid and safe for reading even from other threads. The exception regarding thread safety is SmallString which uses translation table for its pointers.
15+
16+
If we change the SmallString translation table to be global and thread-safe (it should not have lots of write contention anyway) we may access primetable keys and values from another thread and write them directly to sockets.
17+
18+
Use-case: large strings that need to be copied. Sets that need to be serialized for SMEMBERS/HGETALL commands etc. Additional complexity - we will need to lock those variables even for single hop transactions and unlock them afterwards. The unlocking hop does not need to increase user-visible latency since it can be done after we send reply to the socket.

src/server/db_slice.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -202,7 +202,7 @@ pair<PrimeIterator, ExpireIterator> DbSlice::FindExt(DbIndex db_ind, string_view
202202
return make_pair(it, ExpireIterator{});
203203
}
204204

205-
OpResult<pair<PrimeIterator, unsigned>> DbSlice::FindFirst(DbIndex db_index, const ArgSlice& args) {
205+
OpResult<pair<PrimeIterator, unsigned>> DbSlice::FindFirst(DbIndex db_index, ArgSlice args) {
206206
DCHECK(!args.empty());
207207

208208
for (unsigned i = 0; i < args.size(); ++i) {

src/server/db_slice.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -124,9 +124,9 @@ class DbSlice {
124124
// Returns (value, expire) dict entries if key exists, null if it does not exist or has expired.
125125
std::pair<PrimeIterator, ExpireIterator> FindExt(DbIndex db_ind, std::string_view key) const;
126126

127-
// Returns dictEntry, args-index if found, KEY_NOTFOUND otherwise.
127+
// Returns (iterator, args-index) if found, KEY_NOTFOUND otherwise.
128128
// If multiple keys are found, returns the first index in the ArgSlice.
129-
OpResult<std::pair<PrimeIterator, unsigned>> FindFirst(DbIndex db_index, const ArgSlice& args);
129+
OpResult<std::pair<PrimeIterator, unsigned>> FindFirst(DbIndex db_index, ArgSlice args);
130130

131131
// Return .second=true if insertion ocurred, false if we return the existing key.
132132
std::pair<PrimeIterator, bool> AddOrFind(DbIndex db_ind, std::string_view key);

src/server/list_family.cc

Lines changed: 145 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,75 @@ bool ElemCompare(const quicklistEntry& entry, string_view elem) {
114114
return elem == an.Piece();
115115
}
116116

117+
using FFResult = pair<PrimeKey, unsigned>; // key, argument index.
118+
119+
struct ShardFFResult {
120+
PrimeKey key;
121+
ShardId sid = kInvalidSid;
122+
};
123+
124+
OpResult<ShardFFResult> FindFirst(Transaction* trans) {
125+
VLOG(2) << "FindFirst::Find " << trans->DebugId();
126+
127+
// Holds Find results: (iterator to a found key, and its index in the passed arguments).
128+
// See DbSlice::FindFirst for more details.
129+
// spans all the shards for now.
130+
std::vector<OpResult<FFResult>> find_res(trans->shard_set()->size());
131+
fill(find_res.begin(), find_res.end(), OpStatus::KEY_NOTFOUND);
132+
133+
auto cb = [&find_res](auto* t, EngineShard* shard) {
134+
auto args = t->ShardArgsInShard(shard->shard_id());
135+
OpResult<pair<PrimeIterator, unsigned>> ff_res =
136+
shard->db_slice().FindFirst(t->db_index(), args);
137+
138+
if (ff_res) {
139+
FFResult ff_result(ff_res->first->first.AsRef(), ff_res->second);
140+
find_res[shard->shard_id()] = move(ff_result);
141+
} else {
142+
find_res[shard->shard_id()] = ff_res.status();
143+
}
144+
return OpStatus::OK;
145+
};
146+
147+
trans->Execute(move(cb), false);
148+
149+
uint32_t min_arg_indx = UINT32_MAX;
150+
151+
ShardFFResult shard_result;
152+
153+
for (size_t sid = 0; sid < find_res.size(); ++sid) {
154+
const auto& fr = find_res[sid];
155+
auto status = fr.status();
156+
if (status == OpStatus::KEY_NOTFOUND)
157+
continue;
158+
159+
if (status == OpStatus::WRONG_TYPE) {
160+
return status;
161+
}
162+
163+
CHECK(fr);
164+
165+
const auto& it_pos = fr.value();
166+
167+
size_t arg_indx = trans->ReverseArgIndex(sid, it_pos.second);
168+
if (arg_indx < min_arg_indx) {
169+
min_arg_indx = arg_indx;
170+
shard_result.sid = sid;
171+
172+
// we do not dereference the key, do not extract the string value, so it it
173+
// ok to just move it. We can not dereference it due to limitations of SmallString
174+
// that rely on thread-local data-structure for pointer translation.
175+
shard_result.key = it_pos.first.AsRef();
176+
}
177+
}
178+
179+
if (shard_result.sid == kInvalidSid) {
180+
return OpStatus::KEY_NOTFOUND;
181+
}
182+
183+
return OpResult<ShardFFResult>{move(shard_result)};
184+
}
185+
117186
class BPopper {
118187
public:
119188
explicit BPopper(ListDir dir);
@@ -122,22 +191,18 @@ class BPopper {
122191
// If OK is returned then use result() to fetch the value.
123192
OpStatus Run(Transaction* t, unsigned msec);
124193

194+
195+
// returns (key, value) pair.
125196
auto result() const {
126197
return make_pair<string_view, string_view>(key_, value_);
127198
}
128199

129-
bool found() const {
130-
return found_;
131-
}
132-
133200
private:
134201
OpStatus Pop(Transaction* t, EngineShard* shard);
135202

136203
ListDir dir_;
137204

138-
bool found_ = false;
139-
PrimeIterator find_it_;
140-
ShardId find_sid_ = std::numeric_limits<ShardId>::max();
205+
ShardFFResult ff_result_;
141206

142207
string key_;
143208
string value_;
@@ -158,7 +223,7 @@ OpStatus BPopper::Run(Transaction* t, unsigned msec) {
158223

159224
auto* stats = ServerState::tl_connection_stats();
160225

161-
OpResult<Transaction::FindFirstResult> result = t->FindFirst();
226+
OpResult<ShardFFResult> result = FindFirst(t);
162227

163228
if (result.status() == OpStatus::KEY_NOTFOUND) {
164229
if (is_multi) {
@@ -169,14 +234,16 @@ OpStatus BPopper::Run(Transaction* t, unsigned msec) {
169234
return OpStatus::TIMED_OUT;
170235
}
171236

237+
// Block
172238
++stats->num_blocked_clients;
173239
bool wait_succeeded = t->WaitOnWatch(tp);
174240
--stats->num_blocked_clients;
175241

176242
if (!wait_succeeded)
177243
return OpStatus::TIMED_OUT;
178244

179-
result = t->FindFirst(); // retry - must find something.
245+
// Now we have something for sure.
246+
result = FindFirst(t); // retry - must find something.
180247
}
181248

182249
if (!result) {
@@ -185,9 +252,7 @@ OpStatus BPopper::Run(Transaction* t, unsigned msec) {
185252
}
186253

187254
VLOG(1) << "Popping an element";
188-
find_sid_ = result->sid;
189-
find_it_ = result->find_res;
190-
found_ = true;
255+
ff_result_ = move(result.value());
191256

192257
auto cb = [this](Transaction* t, EngineShard* shard) { return Pop(t, shard); };
193258
t->Execute(std::move(cb), true);
@@ -196,18 +261,20 @@ OpStatus BPopper::Run(Transaction* t, unsigned msec) {
196261
}
197262

198263
OpStatus BPopper::Pop(Transaction* t, EngineShard* shard) {
199-
DCHECK(found());
200-
201-
if (shard->shard_id() == find_sid_) {
202-
find_it_->first.GetString(&key_);
264+
if (shard->shard_id() == ff_result_.sid) {
265+
ff_result_.key.GetString(&key_);
203266

204-
quicklist* ql = GetQL(find_it_->second);
267+
auto it_res = shard->db_slice().Find(t->db_index(), key_, OBJ_LIST);
268+
CHECK(it_res); // must exist and must be ok.
269+
PrimeIterator it = *it_res;
270+
quicklist* ql = GetQL(it->second);
205271
value_ = ListPop(dir_, ql);
206272

207273
if (quicklistCount(ql) == 0) {
208-
CHECK(shard->db_slice().Del(t->db_index(), find_it_));
274+
CHECK(shard->db_slice().Del(t->db_index(), it));
209275
}
210276
}
277+
211278
return OpStatus::OK;
212279
}
213280

@@ -242,9 +309,10 @@ void ListFamily::RPopLPush(CmdArgList args, ConnectionContext* cntx) {
242309
string_view dest = ArgS(args, 2);
243310

244311
OpResult<string> result;
245-
if (dest == src) {
312+
313+
if (cntx->transaction->unique_shard_cnt() == 1) {
246314
auto cb = [&](Transaction* t, EngineShard* shard) {
247-
return OpRPopLPushSingleKey(OpArgs{shard, t->db_index()}, src);
315+
return OpRPopLPushSingleShard(OpArgs{shard, t->db_index()}, src, dest);
248316
};
249317

250318
result = cntx->transaction->ScheduleSingleHopT(std::move(cb));
@@ -446,11 +514,12 @@ void ListFamily::BPopGeneric(ListDir dir, CmdArgList args, ConnectionContext* cn
446514
OpStatus result = popper.Run(transaction, unsigned(timeout * 1000));
447515

448516
if (result == OpStatus::OK) {
449-
CHECK(popper.found());
450-
VLOG(1) << "BLPop returned ";
451-
452517
auto res = popper.result();
518+
519+
VLOG(1) << "BLPop returned from " << res.first; // key.
520+
453521
std::string_view str_arr[2] = {res.first, res.second};
522+
454523
return (*cntx)->SendStringArr(str_arr);
455524
}
456525

@@ -550,7 +619,7 @@ OpResult<uint32_t> ListFamily::OpPush(const OpArgs& op_args, std::string_view ke
550619
} else {
551620
tie(it, new_key) = es->db_slice().AddOrFind(op_args.db_ind, key);
552621
}
553-
quicklist* ql;
622+
quicklist* ql = nullptr;
554623

555624
if (new_key) {
556625
robj* o = createQuicklistObject();
@@ -572,10 +641,12 @@ OpResult<uint32_t> ListFamily::OpPush(const OpArgs& op_args, std::string_view ke
572641
quicklistPush(ql, es->tmp_str1, sdslen(es->tmp_str1), pos);
573642
}
574643

575-
if (new_key && es->blocking_controller()) {
576-
string tmp;
577-
string_view key = it->first.GetSlice(&tmp);
578-
es->blocking_controller()->AwakeWatched(op_args.db_ind, key);
644+
if (new_key) {
645+
if (es->blocking_controller()) {
646+
string tmp;
647+
string_view key = it->first.GetSlice(&tmp);
648+
es->blocking_controller()->AwakeWatched(op_args.db_ind, key);
649+
}
579650
} else {
580651
es->db_slice().PostUpdate(op_args.db_ind, it);
581652
}
@@ -811,17 +882,54 @@ OpResult<StringVec> ListFamily::OpRange(const OpArgs& op_args, std::string_view
811882
return str_vec;
812883
}
813884

814-
OpResult<string> ListFamily::OpRPopLPushSingleKey(const OpArgs& op_args, std::string_view key) {
885+
OpResult<string> ListFamily::OpRPopLPushSingleShard(const OpArgs& op_args, string_view src,
886+
string_view dest) {
815887
auto& db_slice = op_args.shard->db_slice();
816-
auto it_res = db_slice.Find(op_args.db_ind, key, OBJ_LIST);
817-
if (!it_res)
818-
return it_res.status();
888+
auto src_res = db_slice.Find(op_args.db_ind, src, OBJ_LIST);
889+
if (!src_res)
890+
return src_res.status();
891+
892+
PrimeIterator src_it = *src_res;
893+
quicklist* src_ql = GetQL(src_it->second);
894+
895+
if (src == dest) { // simple case.
896+
db_slice.PreUpdate(op_args.db_ind, src_it);
897+
string val = ListPop(ListDir::RIGHT, src_ql);
898+
899+
quicklistPushHead(src_ql, val.data(), val.size());
900+
db_slice.PostUpdate(op_args.db_ind, src_it);
901+
902+
return val;
903+
}
904+
905+
quicklist* dest_ql = nullptr;
906+
auto [dest_it, created] = db_slice.AddOrFind(op_args.db_ind, dest);
907+
if (created) {
908+
robj* obj = createQuicklistObject();
909+
dest_ql = (quicklist*)obj->ptr;
910+
quicklistSetOptions(dest_ql, FLAGS_list_max_listpack_size, FLAGS_list_compress_depth);
911+
dest_it->second.ImportRObj(obj);
912+
913+
// Insertion of dest could invalidate src_it. Find it again.
914+
src_it = db_slice.GetTables(op_args.db_ind).first->Find(src);
915+
} else {
916+
if (dest_it->second.ObjType() != OBJ_LIST)
917+
return OpStatus::WRONG_TYPE;
918+
919+
dest_ql = GetQL(dest_it->second);
920+
db_slice.PreUpdate(op_args.db_ind, dest_it);
921+
}
922+
923+
db_slice.PreUpdate(op_args.db_ind, src_it);
924+
string val = ListPop(ListDir::RIGHT, src_ql);
925+
quicklistPushHead(dest_ql, val.data(), val.size());
926+
db_slice.PostUpdate(op_args.db_ind, src_it);
927+
db_slice.PostUpdate(op_args.db_ind, dest_it);
928+
929+
if (quicklistCount(src_ql) == 0) {
930+
CHECK(db_slice.Del(op_args.db_ind, src_it));
931+
}
819932

820-
PrimeIterator it = *it_res;
821-
quicklist* ql = GetQL(it->second);
822-
db_slice.PreUpdate(op_args.db_ind, it);
823-
string val = ListPop(ListDir::RIGHT, ql);
824-
quicklistPushHead(ql, val.data(), val.size());
825933
return val;
826934
}
827935

src/server/list_family.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,8 @@ class ListFamily {
6363
static OpResult<StringVec> OpRange(const OpArgs& op_args, std::string_view key, long start,
6464
long end);
6565

66-
static OpResult<std::string> OpRPopLPushSingleKey(const OpArgs& op_args, std::string_view key);
66+
static OpResult<std::string> OpRPopLPushSingleShard(const OpArgs& op_args, std::string_view src,
67+
std::string_view dest);
6768
};
6869

6970
} // namespace dfly

0 commit comments

Comments
 (0)