Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 11 additions & 2 deletions bdb/bdb_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -2498,8 +2498,17 @@ int bdb_queuedb_has_seq(bdb_state_type *);
void dispatch_waiting_clients(void);

struct sqlclntstate;
int release_locks_int(const char *trace, const char *func, int line, struct sqlclntstate *);
#define release_locks(trace) release_locks_int(trace, __func__, __LINE__, NULL)
typedef enum {
RLOCKS_REASON_SI_LOCKWAIT, /* page lock waiter on SI/serial session */
RLOCKS_REASON_LOCKWAIT, /* page lock waiter on non-SI session */
RLOCKS_REASON_RANDOM, /* random release (testing) */
RLOCKS_REASON_LOCK_DESIRED, /* global BDB write lock desired */
RLOCKS_REASON_EMIT_ROW, /* waiters at row emit */
RLOCKS_REASON_LONG_REPWAIT, /* long rep wait at row emit */
RLOCKS_REASON_SLOW_READER, /* slow reader */
} rlocks_reason_t;
int release_locks_int(rlocks_reason_t reason, const char *func, int line, struct sqlclntstate *);
#define release_locks(reason) release_locks_int(reason, __func__, __LINE__, NULL)

int bdb_keylen(bdb_state_type *bdb_state, int ixnum);

Expand Down
4 changes: 2 additions & 2 deletions bdb/bdb_osqlcur.c
Original file line number Diff line number Diff line change
Expand Up @@ -463,11 +463,11 @@ int bdb_osql_update_shadows(bdb_cursor_ifn_t *pcur_ifn, bdb_osql_trn_t *trn,
logmsg(LOGMSG_WARN,
"%s: releasing locks while updating shadows\n",
__func__);
rc = release_locks("update shadows");
rc = release_locks(RLOCKS_REASON_SI_LOCKWAIT);
released_locks = 1;
} else if (gbl_sql_random_release_interval &&
!(rand() % gbl_sql_random_release_interval)) {
rc = release_locks("random release update shadows");
rc = release_locks(RLOCKS_REASON_RANDOM);
released_locks = 1;
}
if (rc != 0) {
Expand Down
2 changes: 2 additions & 0 deletions db/db_tunables.c
Original file line number Diff line number Diff line change
Expand Up @@ -309,6 +309,8 @@ extern int gbl_create_default_user;
extern int gbl_allow_neg_column_size;
extern int gbl_client_heartbeat_ms;
extern int gbl_rep_wait_release_ms;
extern int gbl_debug_sleep_in_cursor_move;
extern int gbl_recover_deadlock_sync_dta;
extern int gbl_rep_wait_core_ms;
extern int gbl_random_get_curtran_failures;
extern int gbl_txn_fop_noblock;
Expand Down
7 changes: 7 additions & 0 deletions db/db_tunables.h
Original file line number Diff line number Diff line change
Expand Up @@ -1742,6 +1742,13 @@ REGISTER_TUNABLE("client_heartbeat_ms",
TUNABLE_INTEGER, &gbl_client_heartbeat_ms,
EXPERIMENTAL | INTERNAL, NULL, NULL, NULL, NULL);

REGISTER_TUNABLE("debug_sleep_in_cursor_move", "Sleep N ms on each cursor move (testing only). (Default: 0)",
TUNABLE_INTEGER, &gbl_debug_sleep_in_cursor_move, EXPERIMENTAL | INTERNAL, NULL, NULL, NULL, NULL);

REGISTER_TUNABLE("recover_deadlock_sync_dta",
"Sync index/data cursors before lock release in recover_deadlock. (Default: 0)", TUNABLE_BOOLEAN,
&gbl_recover_deadlock_sync_dta, EXPERIMENTAL | INTERNAL, NULL, NULL, NULL, NULL);

REGISTER_TUNABLE("rep_release_wait_ms",
"Release sql-locks if rep-thd is blocked for this many ms."
" (Default: 60000)",
Expand Down
2 changes: 2 additions & 0 deletions db/sql.h
Original file line number Diff line number Diff line change
Expand Up @@ -1286,6 +1286,7 @@ struct BtCursor {
void *query_preparer_data;

int permissions; /* permissions for read/write access to table */
BtCursor *pCursorHintTableCursor;
};

struct sql_hist {
Expand Down Expand Up @@ -1494,6 +1495,7 @@ int sqlite3LockStmtTables(sqlite3_stmt *pStmt);
int sqlite3UnlockStmtTablesRemotes(struct sqlclntstate *clnt);
void sql_remote_schema_changed(struct sqlclntstate *clnt, sqlite3_stmt *pStmt);
int release_locks_on_emit_row(struct sqlclntstate *clnt);
void sync_index_data_cursors(struct sql_thread *thd);

void clearClientSideRow(struct sqlclntstate *clnt);
struct temptable get_tbl_by_rootpg(const sqlite3 *, int);
Expand Down
92 changes: 87 additions & 5 deletions db/sqlglue.c
Original file line number Diff line number Diff line change
Expand Up @@ -3248,6 +3248,54 @@ static inline int sqlite3VdbeCompareRecordPacked(KeyInfo *pKeyInfo, int k1len,
}

unsigned long long release_locks_on_si_lockwait_cnt = 0;
int gbl_debug_sleep_in_cursor_move = 0; /* ms to sleep on each cursor move (testing only) */
int gbl_recover_deadlock_sync_dta = 0; /* sync index/data cursors before lock release */

/*
* Before releasing cursor locks, walk all open index cursors and sync each
* one's paired data cursor (set via BTREE_HINT_TABLECURSOR) to the index
* cursor's current genid if they differ. Must be called while locks are
* still held so the target row is guaranteed to exist at find time.
*/
void sync_index_data_cursors(struct sql_thread *thd)
{
BtCursor *cur;

Pthread_mutex_lock(&thd->lk);
if (!thd->bt)
goto done;

LISTC_FOR_EACH(&thd->bt->cursors, cur, lnk)
{
BtCursor *dta = cur->pCursorHintTableCursor;
if (!dta || !cur->bdbcur || !dta->bdbcur)
continue;
if (cur->cursor_class != CURSORCLASS_INDEX)
continue;
unsigned long long idx_genid = cur->genid;
if (idx_genid == 0 || idx_genid == dta->genid)
continue;
/* data cursor hasn't caught up to the index cursor -- find it now.
* use bdbcur->find directly (not ddguard) to avoid re-entering
* recover_deadlock which also needs thd->lk */
int bdberr;
int rc = dta->bdbcur->find(dta->bdbcur, &idx_genid, sizeof(idx_genid), 0, &bdberr);
if (rc != IX_FND)
continue;
int fndlen;
void *buf;
uint8_t ver;
dta->bdbcur->get_found_data(dta->bdbcur, &dta->rrn, &dta->genid, &fndlen, &buf, &ver);
vtag_to_ondisk(dta->db, buf, &fndlen, ver, dta->genid);
dta->ondisk_buf = buf;
dta->dtabuf = buf;
dta->dtabuflen = fndlen;
}

done:
Pthread_mutex_unlock(&thd->lk);
}

/* Release pagelocks if the replicant is waiting on this sql thread */
static int cursor_move_postop(BtCursor *pCur)
{
Expand All @@ -3257,16 +3305,21 @@ static int cursor_move_postop(BtCursor *pCur)
extern int gbl_locks_check_waiters;
int rc = 0;

if (gbl_locks_check_waiters && gbl_sql_release_locks_on_si_lockwait &&
(clnt->dbtran.mode == TRANLEVEL_SNAPISOL ||
clnt->dbtran.mode == TRANLEVEL_SERIAL)) {
if (gbl_debug_sleep_in_cursor_move)
poll(NULL, 0, gbl_debug_sleep_in_cursor_move);

if (gbl_locks_check_waiters) {
extern int gbl_sql_random_release_interval;
int is_si = (clnt->dbtran.mode == TRANLEVEL_SNAPISOL || clnt->dbtran.mode == TRANLEVEL_SERIAL);
if (is_si && !gbl_sql_release_locks_on_si_lockwait)
return 0;
rlocks_reason_t reason = is_si ? RLOCKS_REASON_SI_LOCKWAIT : RLOCKS_REASON_LOCKWAIT;
if (bdb_curtran_has_waiters(thedb->bdb_env, clnt->dbtran.cursor_tran)) {
rc = release_locks("replication is waiting on si-session");
rc = release_locks(reason);
release_locks_on_si_lockwait_cnt++;
} else if (gbl_sql_random_release_interval &&
!(rand() % gbl_sql_random_release_interval)) {
rc = release_locks("random release cursor_move_postop");
rc = release_locks(RLOCKS_REASON_RANDOM);
release_locks_on_si_lockwait_cnt++;
}
}
Expand Down Expand Up @@ -5986,6 +6039,17 @@ int sqlite3BtreeMovetoUnpacked(BtCursor *pCur, /* The cursor to be moved */
}
}

/* If the cursor was pre-synced by sync_index_data_cursors before a
* lock release, the cursor is already positioned at the requested
* genid and the data is in the cursor's buffers. Skip the BDB seek
* to avoid a "Dta lookup lost the race" failure if the row was deleted
* during the lock-release window. */
if (gbl_recover_deadlock_sync_dta && bias == OP_DeferredSeek && pCur->genid == (unsigned long long)intKey &&
!pCur->eof) {
*pRes = 0;
goto done;
}

/* TODO: we already found the data record. find some way to map between
* index/data cursors and don't do extra data fetches unless we
* move the cursor */
Expand Down Expand Up @@ -11658,6 +11722,14 @@ const char *comdb2_get_sql(void)
}

int gbl_fdb_track_hints = 0;
static void sqlite3BtreeCursorHint_TableCursor(BtCursor *pCur, BtCursor *pTableCsr)
{
assert(pCur->cursor_class == CURSORCLASS_INDEX);
assert(pTableCsr->cursor_class == CURSORCLASS_TABLE);
assert(pCur->db == pTableCsr->db);
pCur->pCursorHintTableCursor = pTableCsr;
}

static void sqlite3BtreeCursorHint_Range(BtCursor *pCur, const Expr *pExpr)
{
char *expr = "?no vdbe engine?";
Expand Down Expand Up @@ -11713,10 +11785,20 @@ void sqlite3BtreeCursorHint(BtCursor *pCur, int eHintType, ...)

break;
}

case BTREE_HINT_TABLECURSOR: {
sqlite3BtreeCursorHint_TableCursor(pCur, va_arg(ap, BtCursor *));
break;
}
}
va_end(ap);
}

BtCursor *sqlite3BtreeCursorHintTblCsr(BtCursor *pCsr)
{
return pCsr->pCursorHintTableCursor;
}

int fdb_packedsqlite_extract_genid(char *key, int *outlen, char *outbuf)
{
int hdroffset = 0;
Expand Down
41 changes: 33 additions & 8 deletions db/sqlinterfaces.c
Original file line number Diff line number Diff line change
Expand Up @@ -2828,18 +2828,43 @@ static int check_thd_gen(struct sqlthdstate *thd, struct sqlclntstate *clnt, int
return SQLITE_OK;
}

int release_locks_int(const char *trace, const char *func, int line, struct sqlclntstate *clnt)
static const char *rlocks_reason_str(rlocks_reason_t reason)
{
switch (reason) {
case RLOCKS_REASON_SI_LOCKWAIT:
return "replication is waiting on si-session";
case RLOCKS_REASON_LOCKWAIT:
return "replication is waiting on session";
case RLOCKS_REASON_RANDOM:
return "random release";
case RLOCKS_REASON_LOCK_DESIRED:
return "release locks on emit-row for lock-desired";
case RLOCKS_REASON_EMIT_ROW:
return "release locks on emit-row";
case RLOCKS_REASON_LONG_REPWAIT:
return "long repwait at emit-row";
case RLOCKS_REASON_SLOW_READER:
return "slow reader";
default:
return "unknown";
}
}

int release_locks_int(rlocks_reason_t reason, const char *func, int line, struct sqlclntstate *clnt)
{
struct sql_thread *thd = pthread_getspecific(query_info_key);
if (!clnt) {
struct sql_thread *thd = pthread_getspecific(query_info_key);
if (thd) clnt = thd->clnt;
}
if (!clnt || !clnt->dbtran.cursor_tran) return -1;
extern int gbl_sql_release_locks_trace;
if (gbl_sql_release_locks_trace) {
logmsg(LOGMSG_USER, "Releasing locks for lockid %d, %s\n",
bdb_get_lid_from_cursortran(clnt->dbtran.cursor_tran), trace);
bdb_get_lid_from_cursortran(clnt->dbtran.cursor_tran), rlocks_reason_str(reason));
}
extern int gbl_recover_deadlock_sync_dta;
if (reason == RLOCKS_REASON_LOCKWAIT && gbl_recover_deadlock_sync_dta && thd)
sync_index_data_cursors(thd);
return recover_deadlock_flags(thedb->bdb_env, clnt, NULL, -1, func, line, 0);
}

Expand All @@ -2856,7 +2881,7 @@ int release_locks_on_emit_row(struct sqlclntstate *clnt)

/* Always release if we're emitting during a master change */
if (bdb_lock_desired(thedb->bdb_env))
return release_locks_int("release locks on emit-row for lock-desired", __func__, __LINE__, clnt);
return release_locks_int(RLOCKS_REASON_LOCK_DESIRED, __func__, __LINE__, clnt);

/* Short circuit if check-waiters or tunable is disabled */
if (!gbl_locks_check_waiters)
Expand All @@ -2868,20 +2893,20 @@ int release_locks_on_emit_row(struct sqlclntstate *clnt)
/* Release locks randomly for testing */
if (gbl_sql_random_release_interval &&
!(rand() % gbl_sql_random_release_interval))
return release_locks_int("random release emit-row", __func__, __LINE__, clnt);
return release_locks_int(RLOCKS_REASON_RANDOM, __func__, __LINE__, clnt);

/* Short circuit if we don't have any waiters */
if (!bdb_curtran_has_waiters(thedb->bdb_env, clnt->dbtran.cursor_tran))
return 0;

/* We're emitting a row & have waiters */
if (!gbl_rep_wait_release_ms || thedb->master == gbl_myhostname)
return release_locks_int("release locks on emit-row", __func__, __LINE__, clnt);
return release_locks_int(RLOCKS_REASON_EMIT_ROW, __func__, __LINE__, clnt);

/* We're emitting a row and are blocking replication */
if (rep_lock_time_ms &&
(comdb2_time_epochms() - rep_lock_time_ms) > gbl_rep_wait_release_ms)
return release_locks_int("long repwait at emit-row", __func__, __LINE__, clnt);
return release_locks_int(RLOCKS_REASON_LONG_REPWAIT, __func__, __LINE__, clnt);

return 0;
}
Expand Down Expand Up @@ -5663,7 +5688,7 @@ static int recover_deadlock_sbuf(struct sqlclntstate *clnt)

/* Sql thread */
if (thd) {
if (release_locks_int("slow reader", __func__, __LINE__, clnt) != 0) {
if (release_locks_int(RLOCKS_REASON_SLOW_READER, __func__, __LINE__, clnt) != 0) {
assert(bdb_lockref() == 0);
logmsg(LOGMSG_ERROR, "%s release_locks failed\n", __func__);
return 1;
Expand Down
2 changes: 1 addition & 1 deletion lua/sp.c
Original file line number Diff line number Diff line change
Expand Up @@ -5512,7 +5512,7 @@ static int l_send_back_row(Lua lua, sqlite3_stmt *stmt, int nargs)
* air to check if bdb_lock_desired */
while ((rc = pthread_mutex_trylock(parent->emit_mutex)) == EBUSY) {
if (bdb_lock_desired(thedb->bdb_env)) {
rc = release_locks("release locks on emit-row for lock-desired");
rc = release_locks(RLOCKS_REASON_LOCK_DESIRED);
if (rc) {
logmsg(LOGMSG_ERROR, "%s release_locks_on_emit_row %d\n", __func__, rc);
return rc;
Expand Down
2 changes: 2 additions & 0 deletions sqlite/src/sqlite_btree.h
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,7 @@ int sqlite3BtreeNewDb(Btree *p);
*/
#define BTREE_HINT_FLAGS 1 /* Set flags indicating cursor usage */
#define BTREE_HINT_RANGE 2 /* Range constraints on queries */
#define BTREE_HINT_TABLECURSOR 3 /* Table csr associated with this index csr */

/*
** Values that may be OR'd together to form the second argument to the
Expand Down Expand Up @@ -230,6 +231,7 @@ int sqlite3BtreeNewDb(Btree *p);

#ifdef SQLITE_ENABLE_CURSOR_HINTS
void sqlite3BtreeCursorHint(BtCursor*, int, ...);
BtCursor *sqlite3BtreeCursorHintTblCsr(BtCursor*);
#endif

#define BTREE_CUR_RD 0x00000001
Expand Down
33 changes: 25 additions & 8 deletions sqlite/src/vdbe.c
Original file line number Diff line number Diff line change
Expand Up @@ -6563,6 +6563,9 @@ case OP_IdxRowid: { /* out2 */
assert( pTabCur->eCurType==CURTYPE_BTREE );
assert( pTabCur->uc.pCursor!=0 );
assert( pTabCur->isTable );
assert(
sqlite3BtreeCursorHintTblCsr(pC->uc.pCursor)==pTabCur->uc.pCursor
);
pTabCur->nullRow = 0;
pTabCur->movetoTarget = rowid;
pTabCur->deferredMoveto = 1;
Expand Down Expand Up @@ -8661,25 +8664,39 @@ case OP_Init: { /* jump */
}

#ifdef SQLITE_ENABLE_CURSOR_HINTS
/* Opcode: CursorHint P1 * * P4 *
/* Opcode: CursorHint P1 * P3 P4 *
**
** Provide a hint to cursor P1.
**
** Provide a hint to cursor P1 that it only needs to return rows that
** satisfy the Expr in P4. TK_REGISTER terms in the P4 expression refer
** to values currently held in registers. TK_COLUMN terms in the P4
** If P4 is of type P4_EXPR, then the hint is that the cursor need only return
** rows that satisfy the Expr in P4. TK_REGISTER terms in the P4 expression
** refer to values currently held in registers. TK_COLUMN terms in the P4
** expression refer to columns in the b-tree to which cursor P1 is pointing.
** P3 is ignored in this case.
**
** Or, if P4 is P4_NOTUSED, then the hint is that cursor P1 is an index cursor
** used to drive table cursor P3. In other words, that this VM may execute
** OP_DeferredSeek instructions to lazily position P3 based on current
** position of P1.
*/
case OP_CursorHint: {
VdbeCursor *pC;
pC = p->apCsr[pOp->p1];

assert( pOp->p1>=0 && pOp->p1<p->nCursor );
assert( pOp->p4type==P4_EXPR );
pC = p->apCsr[pOp->p1];

if( pC ){
#if !defined(SQLITE_BUILDING_FOR_COMDB2)
assert( pC->eCurType==CURTYPE_BTREE );
#endif /* !defined(SQLITE_BUILDING_FOR_COMDB2) */
sqlite3BtreeCursorHint(pC->uc.pCursor, BTREE_HINT_RANGE,
pOp->p4.pExpr, aMem);
if( pOp->p4type==P4_EXPR ){
sqlite3BtreeCursorHint(pC->uc.pCursor, BTREE_HINT_RANGE,
pOp->p4.pExpr, aMem);
}else if( p->apCsr[pOp->p3] ){
sqlite3BtreeCursorHint(
pC->uc.pCursor, BTREE_HINT_TABLECURSOR, p->apCsr[pOp->p3]->uc.pCursor
);
}
}
break;
}
Expand Down
2 changes: 1 addition & 1 deletion sqlite/src/vdbeaux.c
Original file line number Diff line number Diff line change
Expand Up @@ -3597,7 +3597,7 @@ int SQLITE_NOINLINE sqlite3VdbeFinishMoveto(VdbeCursor *p){
snprintf(errmsg, sizeof(errmsg),
"Dta lookup lost the race for tbl %s genid=%llu (%llx) [new]\n",
sqlite3BtreeGetTblName(p->uc.pCursor),
bdb_genid_to_host_order(p->movetoTarget),
bdb_genid_to_host_order(p->movetoTarget),
bdb_genid_to_host_order(p->movetoTarget));
logmsg(LOGMSG_ERROR, "%s\n", errmsg);
if (gbl_abort_on_dta_lookup_error)
Expand Down
Loading
Loading