Skip to content

Commit dd2bf39

Browse files
Fix potential races in Pthread gather/scatter. (#297)
This likely isn't an ideal fix, but it appears to address the issues in qvi_hwsplit_coll::split() that I was seeing earlier. Signed-off-by: Samuel K. Gutierrez <[email protected]>
1 parent c406510 commit dd2bf39

File tree

2 files changed

+20
-15
lines changed

2 files changed

+20
-15
lines changed

src/qvi-pthread.cc

+20-11
Original file line numberDiff line numberDiff line change
@@ -192,8 +192,6 @@ qvi_pthread_group::m_subgroup_info(
192192
int rc = QV_SUCCESS;
193193
const int master_rank = 0;
194194
const int my_rank = rank();
195-
// TODO(skg)
196-
//qvi_log_debug("color={}, key={}, rank={}", color, key, my_rank);
197195
// Gather colors and keys from ALL threads in the parent group.
198196
// NOTE: this (i.e., the caller) is a member of the parent group).
199197
m_ckrs[my_rank].color = color;
@@ -277,9 +275,10 @@ qvi_pthread_group::split(
277275
const qvi_group_id_t mygid = m_subgroup_gids[sginfo.index];
278276
ichild = m_context->groupid2thgroup.at(mygid);
279277
}
278+
// Now we can check if errors happened above.
279+
if (qvi_unlikely(rc != QV_SUCCESS)) goto out;
280280

281281
rc = m_finish_init_by_all_threads(ichild);
282-
283282
out:
284283
if (qvi_unlikely(rc != QV_SUCCESS)) {
285284
qvi_delete(&ichild);
@@ -295,16 +294,22 @@ qvi_pthread_group::gather(
295294
qvi_bbuff_alloc_type_t *alloc_type,
296295
qvi_bbuff ***rxbuffs
297296
) {
298-
const int rc = qvi_bbuff_copy(*txbuff, m_data_g[rank()]);
297+
const int myrank = rank();
298+
// I'm not sure why this barrier is needed, but it seems to help...
299+
barrier();
300+
int rc = QV_SUCCESS;
301+
{
302+
std::lock_guard<std::mutex> guard(m_mutex);
303+
rc = qvi_bbuff_copy(*txbuff, m_data_g[myrank]);
304+
*alloc_type = QVI_BBUFF_ALLOC_SHARED_GLOBAL;
305+
}
299306
// Need to ensure that all threads have contributed to m_data_g
300-
pthread_barrier_wait(&m_barrier);
301-
*alloc_type = QVI_BBUFF_ALLOC_SHARED_GLOBAL;
307+
barrier();
302308

303309
if (qvi_unlikely(rc != QV_SUCCESS)) {
304310
*rxbuffs = nullptr;
305311
return QV_ERR_INTERNAL;
306312
}
307-
308313
*rxbuffs = m_data_g;
309314
return rc;
310315
}
@@ -315,16 +320,20 @@ qvi_pthread_group::scatter(
315320
int rootid,
316321
qvi_bbuff **rxbuff
317322
) {
323+
int rc = QV_SUCCESS;
318324
const int myrank = rank();
325+
qvi_bbuff *mybbuff = nullptr;
319326

320327
if (rootid == myrank) {
321328
*m_data_s = txbuffs;
322329
}
323-
pthread_barrier_wait(&m_barrier);
324330

325-
qvi_bbuff *mybbuff = nullptr;
326-
const int rc = qvi_bbuff_dup( *((*m_data_s)[myrank]), &mybbuff);
327-
pthread_barrier_wait(&m_barrier);
331+
barrier();
332+
{
333+
std::lock_guard<std::mutex> guard(m_mutex);
334+
rc = qvi_bbuff_dup(*((*m_data_s)[myrank]), &mybbuff);
335+
}
336+
barrier();
328337

329338
if (qvi_unlikely(rc != QV_SUCCESS)) {
330339
qvi_bbuff_delete(&mybbuff);

src/qvi-scope.cc

-4
Original file line numberDiff line numberDiff line change
@@ -237,10 +237,6 @@ qv_scope::split(
237237
);
238238
rc = chwsplit.split(&colorp, &hwpool);
239239
if (rc != QV_SUCCESS) goto out;
240-
// TODO(skg) In the threaded case it looks like there is a race here or
241-
// something else is wrong. See colorp.
242-
//qvi_log_debug("SCOPE SPLIT npieces={}, color={} colorp={}", npieces, color, colorp);
243-
244240
// Split underlying group. Notice the use of colorp here.
245241
rc = m_group->split(
246242
colorp, m_group->rank(), &group

0 commit comments

Comments
 (0)