Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 30 additions & 3 deletions db/toblock.c
Original file line number Diff line number Diff line change
Expand Up @@ -6376,10 +6376,37 @@ static int toblock_main(struct javasp_trans_state *javasp_trans_handle, struct i
}

if (iq->tptlock) {
/*
* Lock ordering problem: this path acquires views_lk(rd) here, then
* bplog_schemachange_run() acquires schema_lk(wr) deep inside
* toblock_main_int(). That is the reverse of the canonical order
* (schema_lk -> views_lk) used by the time-partition cron threads
* (_view_cron_phase1/2/3) and by views_cron_restart().
*
* If a transaction holds both a BPFUNC for time-partition retention
* (sets tptlock, acquires views_lk(rd)) and a DDL schema change (sets
* tranddl, acquires schema_lk(wr) via bplog_schemachange_run), the
* following deadlock can occur:
*
* Thread A (this block processor):
* holds views_lk(rd), waits for schema_lk(wr)
*
* Thread B (cron phase1/2/3 or any SC path):
* holds schema_lk(wr or rd), waits for views_lk(wr)
*
* Thread A cannot get schema_lk(wr) because Thread B holds it.
* Thread B cannot get views_lk(wr) because Thread A holds views_lk(rd)
* (readers block writers).
*
* The proper fix requires either narrowing the views_lk scope inside
* toblock_main_int() or hoisting schema_lk acquisition to before
* views_lock() — both require significant refactoring.
*
* Until then, we prevent the deadlock by rejecting any transaction that
* sets both tptlock (BPFUNC TIMEPART_RETENTION) and tranddl (DDL schema
* changes) in the same transaction.
*/
if (iq->tranddl) {
/* avoid a lock inversion here; simply forbit mixing bpfunc that
* requires tpt lock with other schema changes
*/
reqlog_set_error(
iq->reqlogger,
"Error cannot mix tpt retention with other schema changes",
Expand Down
30 changes: 15 additions & 15 deletions db/views.c
Original file line number Diff line number Diff line change
Expand Up @@ -1111,6 +1111,7 @@ void *_view_cron_phase2(struct cron_event *event, struct errstat *err)

if (run) {
bdb_thread_event(thedb->bdb_env, BDBTHR_EVENT_START);
BDB_READLOCK(__func__);
rdlock_schema_lk();
Pthread_rwlock_wrlock(&views_lk);

Expand All @@ -1128,7 +1129,7 @@ void *_view_cron_phase2(struct cron_event *event, struct errstat *err)
(arg1) ? (char *)arg1 : "NULL", arg2,
(arg2) ? (char *)arg2 : "NULL", arg3);

/* this is a safeguard! we take effort to schedule cleanup of
/* this is a safeguard! we take effort to schedule cleanup of
a dropped partition ahead of everything, but jic ! */
if (unlikely(
_validate_view_id(view, event->source_id, "phase 2", err))) {
Expand All @@ -1147,8 +1148,6 @@ void *_view_cron_phase2(struct cron_event *event, struct errstat *err)
/* lets save this */
timeCrtRollout = view->roll_time;

BDB_READLOCK(__func__);

rc = _views_rollout_phase2(view, pShardName, &timeNextRollout,
&removeShardName, err);

Expand All @@ -1158,14 +1157,11 @@ void *_view_cron_phase2(struct cron_event *event, struct errstat *err)
thedb->bdb_env, view->name,
0 /*not sure it is safe here to wait, so don't*/, &bdberr);
if (rc != 0) {
logmsg(LOGMSG_ERROR,
"%s -- bdb_llog_views view %s rc:%d bdberr:%d\n",
__func__, view->name, rc, bdberr);
logmsg(LOGMSG_ERROR, "%s -- bdb_llog_views view %s rc:%d bdberr:%d\n", __func__, view->name, rc,
bdberr);
}
}

BDB_RELLOCK();

/* tell the world */
gbl_views_gen++;
view->version = gbl_views_gen;
Expand All @@ -1178,6 +1174,7 @@ void *_view_cron_phase2(struct cron_event *event, struct errstat *err)
Pthread_rwlock_unlock(&views_lk);
unlock_schema_lk();
csc2_free_all();
BDB_RELLOCK();
bdb_thread_event(thedb->bdb_env, BDBTHR_EVENT_DONE);

/* schedule next */
Expand Down Expand Up @@ -1937,9 +1934,16 @@ int views_cron_restart(timepart_views_t *views)
we will requeue them */
cron_clear_queue(timepart_sched);

/* Acquire BDB replock before views_lk to match the ordering used in
_view_cron_phase1/3 (BDB_READLOCK -> views_lk). Consistent ordering
across all callers avoids a potential deadlock if the bdb_lock is ever
initialized with writer preference, and makes lock auditing simpler. */
bdb_thread_event(thedb->bdb_env, BDBTHR_EVENT_START);
BDB_READLOCK(__func__);

/* corner case: master started and schema change for time partition
submitted before watchdog thread has time to restart it, will deadlock
if this is the case, abort the schema change */
submitted before watchdog thread has time to restart it; abort the
schema change so views_cron_restart does not wait indefinitely */
rc = pthread_rwlock_trywrlock(&views_lk);
if (rc == EBUSY) {
if (get_schema_change_in_progress(__func__, __LINE__)) {
Expand All @@ -1953,9 +1957,6 @@ int views_cron_restart(timepart_views_t *views)
abort();
}

bdb_thread_event(thedb->bdb_env, BDBTHR_EVENT_START);
BDB_READLOCK(__func__);

if (thedb->master == gbl_myhostname && !gbl_is_physical_replicant) {
/* queue all the events required for this */
for(i=0;i<views->nviews; i++)
Expand Down Expand Up @@ -1984,9 +1985,8 @@ int views_cron_restart(timepart_views_t *views)
rc = VIEW_NOERR;

done:
BDB_RELLOCK();

Pthread_rwlock_unlock(&views_lk);
BDB_RELLOCK();
bdb_thread_event(thedb->bdb_env, BDBTHR_EVENT_DONE);
return rc;
}
Expand Down
Loading