
Commit 7faa57d

Enhance galera to interact across multiple clusters

This change adds a new "remote_node_map" parameter to the Galera resource agent, which allows Galera node names that live in other Pacemaker clusters to be counted as part of this cluster's Galera quorum. To achieve this, the agent runs cluster commands (crm_attribute, crm_mon, crm_master) over SSH on the remote clusters in order to view and modify their state attributes. WIP.
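The parameter uses the same semicolon-separated name:value syntax as the existing cluster_host_map option, mapping a pacemaker node name to the SSH expression of the cluster that hosts it. As a purely illustrative example (the node names and hostnames below are assumptions, not part of the commit):

    remote_node_map="pcmk4:root@controller-b1;pcmk5:root@controller-b2"

With such a value, attribute reads and writes for pcmk4 and pcmk5 are executed over SSH against their own cluster rather than the local CIB, and master_exists additionally polls each distinct remote endpoint with crm_mon.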
Parent: 9d9cd48

1 file changed (+107 -13 lines)

heartbeat/galera

Lines changed: 107 additions & 13 deletions
@@ -25,7 +25,7 @@
 
 ##
 # README.
-#
+#
 # This agent only supports being configured as a multistate Master
 # resource.
 #
@@ -49,15 +49,15 @@
 # pcs resource create db galera enable_creation=true \
 # wsrep_cluster_address="gcomm://rhel7-auto1,rhel7-auto2,rhel7-auto3" meta master-max=3 --master
 #
-# By setting the 'enable_creation' option, the database will be automatically
+# By setting the 'enable_creation' option, the database will be automatically
 # generated at startup. The meta attribute 'master-max=3' means that all 3
 # nodes listed in the wsrep_cluster_address list will be allowed to connect
 # to the galera cluster and perform replication.
 #
 # NOTE: If you have more nodes in the pacemaker cluster then you wish
 # to have in the galera cluster, make sure to use location contraints to prevent
 # pacemaker from attempting to place a galera instance on a node that is
-# not in the 'wsrep_cluster_address" list.
+# not in the 'wsrep_cluster_address" list.
 #
 ##
 
@@ -68,6 +68,24 @@
 . ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs
 . ${OCF_FUNCTIONS_DIR}/mysql-common.sh
 
+
+# FIXME: So at the moment, the proof of concept is using SSH to
+# get at the "other" cluster, and to authenticate I just have id_rsa keys
+# copied to all the controllers under "root", since that's who can run
+# pcs commands. So the RA on a docker container has root access to all
+# the controller nodes.
+#
+# So we would like to either:
+#
+# a. ssh as some other locked down account that can still run the
+# necessary pcs commands on the other cluster? or some setuid script that
+# does the things we need?
+#
+# b. totally other means of invoking pcs commands on remote cluster? web
+# service or something?
+#
+SSH_CMD="${SSH} -oStrictHostKeyChecking=no"
+
 NODENAME=$(ocf_attribute_target)
 
 # It is common for some galera instances to store
@@ -226,11 +244,26 @@ pcmk1:node.1.galera;pcmk2:node.2.galera;pcmk3:node.3.galera
 
 where the galera resource started on node pcmk1 would be named
 node.1.galera in the wsrep_cluster_address
+
 </longdesc>
 <shortdesc lang="en">Pacemaker to Galera name mapping</shortdesc>
 <content type="string" default=""/>
 </parameter>
 
+<parameter name="remote_node_map" unique="0" required="0">
+<longdesc lang="en">
+A mapping of pacemaker node names to remote host SSH expressions.
+
+Allows pacemaker nodes in remote pacemaker clusters to be part of this
+Galera cluster:
+
+
+
+</longdesc>
+<shortdesc lang="en">Pacemaker to remote cluster nodes</shortdesc>
+<content type="string" default=""/>
+</parameter>
+
 <parameter name="check_user" unique="0" required="0">
 <longdesc lang="en">
 Cluster check user.
@@ -283,7 +316,7 @@ set_bootstrap_node()
 {
     local node=$(ocf_attribute_target $1)
 
-    ${HA_SBIN_DIR}/crm_attribute -N $node -l reboot --name "${INSTANCE_ATTR_NAME}-bootstrap" -v "true"
+    remote_crm_attribute $node -l reboot --name "${INSTANCE_ATTR_NAME}-bootstrap" -v "true"
 }
 
 clear_bootstrap_node()
@@ -310,7 +343,7 @@ clear_no_grastate()
 is_no_grastate()
 {
     local node=$(ocf_attribute_target $1)
-    ${HA_SBIN_DIR}/crm_attribute -N $node -l reboot --name "${INSTANCE_ATTR_NAME}-no-grastate" --quiet 2>/dev/null
+    remote_crm_attribute $node -l reboot --name "${INSTANCE_ATTR_NAME}-no-grastate" --quiet 2>/dev/null
 }
 
 clear_last_commit()
@@ -329,8 +362,8 @@ get_last_commit()
 
     if [ -z "$node" ]; then
         ${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l reboot --name "${INSTANCE_ATTR_NAME}-last-committed" --quiet 2>/dev/null
-    else
-        ${HA_SBIN_DIR}/crm_attribute -N $node -l reboot --name "${INSTANCE_ATTR_NAME}-last-committed" --quiet 2>/dev/null
+    else
+        remote_crm_attribute $node -l reboot --name "${INSTANCE_ATTR_NAME}-last-committed" --quiet 2>/dev/null
     fi
 }
 
@@ -351,7 +384,7 @@ get_safe_to_bootstrap()
     if [ -z "$node" ]; then
         ${HA_SBIN_DIR}/crm_attribute -N $NODENAME -l reboot --name "${INSTANCE_ATTR_NAME}-safe-to-bootstrap" --quiet 2>/dev/null
     else
-        ${HA_SBIN_DIR}/crm_attribute -N $node -l reboot --name "${INSTANCE_ATTR_NAME}-safe-to-bootstrap" --quiet 2>/dev/null
+        remote_crm_attribute $node -l reboot --name "${INSTANCE_ATTR_NAME}-safe-to-bootstrap" --quiet 2>/dev/null
     fi
 }
 
@@ -410,16 +443,34 @@ master_exists()
     fi
     # determine if a master instance is already up and is healthy
     crm_mon --as-xml | grep "resource.*id=\"${INSTANCE_ATTR_NAME}\".*role=\"Master\".*active=\"true\".*orphaned=\"false\".*failed=\"false\"" > /dev/null 2>&1
-    return $?
+
+    local master_exists_local=$?
+
+    if [ $master_exists_local -eq 0 ]; then
+        ocf_log info "Detected that a master exists for the local cluster"
+    fi
+
+    # if not, and we have remote nodes, check those also
+    if [ $master_exists_local -ne 0 ] && [ -n "$OCF_RESKEY_remote_node_map" ]; then
+        for remote_ssh in $(echo "$OCF_RESKEY_remote_node_map" | tr ';' '\n' | tr -d ' ' | sed 's/:/ /' | awk -F' ' '{print $2;}' | sort | uniq); do
+            $SSH_CMD $remote_ssh crm_mon --as-xml | grep "resource.*id=\"${INSTANCE_ATTR_NAME}\".*role=\"Master\".*active=\"true\".*orphaned=\"false\".*failed=\"false\"" > /dev/null 2>&1
+            if [ $? -eq 0 ]; then
+                ocf_log info "Detected that a master exists for the remote cluster $remote_ssh"
+                return $?
+            fi
+        done
+    fi
+
+    return $master_exists_local
 }
 
 clear_master_score()
 {
     local node=$(ocf_attribute_target $1)
     if [ -z "$node" ]; then
         $CRM_MASTER -D
-    else
-        $CRM_MASTER -D -N $node
+    else
+        remote_crm_master $node -D
     fi
 }
 
@@ -429,8 +480,51 @@ set_master_score()
 
     if [ -z "$node" ]; then
         $CRM_MASTER -v 100
-    else
-        $CRM_MASTER -N $node -v 100
+    else
+        remote_crm_master $node -v 100
+    fi
+}
+
+get_remote_node()
+{
+    local node=$1
+    if [ -z "$OCF_RESKEY_remote_node_map" ]; then
+        return
+    else
+        local retval=$(echo "$OCF_RESKEY_remote_node_map" | tr ';' '\n' | tr -d ' ' | sed 's/:/ /' | awk -F' ' '$1=="'"$node"'" {print $2;exit}')
+        if [ -z "$retval" ]; then
+            return
+        else
+            echo $retval
+        fi
+    fi
+}
+
+remote_crm_master()
+{
+    local node=$1
+    shift
+
+    local remote_ssh=$(get_remote_node $node)
+
+    if [ -z "$remote_ssh" ]; then
+        $CRM_MASTER -N $node "$@"
+    else
+        $SSH_CMD $remote_ssh $CRM_MASTER -r ${INSTANCE_ATTR_NAME} -N $node "$@"
+    fi
+}
+
+remote_crm_attribute()
+{
+    local node=$1
+    shift
+
+    local remote_ssh=$(get_remote_node $node)
+
+    if [ -z "$remote_ssh" ]; then
+        ${HA_SBIN_DIR}/crm_attribute -N $node "$@"
+    else
+        $SSH_CMD $remote_ssh ${HA_SBIN_DIR}/crm_attribute -N $node "$@"
     fi
 }
 
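To make the mapping format and the new helpers concrete, the following stand-alone sketch reproduces the parsing done by get_remote_node() and master_exists(), and echoes the kind of command line remote_crm_attribute() would hand to SSH. The node names, hostnames, crm_attribute path and attribute name are illustrative assumptions; the agent itself derives them from OCF_RESKEY_remote_node_map, HA_SBIN_DIR and INSTANCE_ATTR_NAME.

    #!/bin/sh
    # Illustrative value; the agent reads this from the remote_node_map parameter.
    OCF_RESKEY_remote_node_map="pcmk4:root@controller-b1; pcmk5:root@controller-b2"

    # Per-node lookup, as in get_remote_node(): split entries on ';', strip spaces,
    # turn the first ':' into a field separator, print the SSH part of the match.
    node="pcmk5"
    echo "$OCF_RESKEY_remote_node_map" | tr ';' '\n' | tr -d ' ' | sed 's/:/ /' \
        | awk -F' ' '$1=="'"$node"'" {print $2;exit}'
    # -> root@controller-b2

    # Distinct remote endpoints, as in master_exists(): each SSH expression once,
    # so every remote cluster is polled a single time with crm_mon.
    echo "$OCF_RESKEY_remote_node_map" | tr ';' '\n' | tr -d ' ' | sed 's/:/ /' \
        | awk -F' ' '{print $2;}' | sort | uniq
    # -> root@controller-b1
    #    root@controller-b2

    # The command remote_crm_attribute() would run for a mapped node, with
    # SSH_CMD="${SSH} -oStrictHostKeyChecking=no" as defined in the agent
    # (binary path and attribute name assumed here for illustration):
    echo ssh -oStrictHostKeyChecking=no root@controller-b2 \
        /usr/sbin/crm_attribute -N pcmk5 -l reboot \
        --name galera-last-committed --quiet

Because the parsing strips all spaces, whitespace around entries is tolerated, but an SSH expression itself must not contain spaces.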
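On the open question in the FIXME about avoiding blanket root SSH access, one possible shape for option (a) is an unprivileged account whose key is restricted to an SSH forced command that whitelists only the cluster commands the agent issues. The account name, wrapper path and sudo policy below are assumptions sketched for discussion, not something this commit implements:

    # 1) authorized_keys entry for a dedicated "galera-remote" account on each
    #    remote controller: force every login through a wrapper, no forwarding,
    #    no pty.
    command="/usr/local/sbin/galera-remote-wrapper",no-port-forwarding,no-agent-forwarding,no-X11-forwarding,no-pty ssh-ed25519 AAAA... galera-ra

    # 2) /usr/local/sbin/galera-remote-wrapper: permit only the cluster commands
    #    the agent sends (crm_mon, crm_attribute, crm_master), escalated through
    #    a matching NOPASSWD sudoers entry limited to exactly those binaries.
    #!/bin/sh
    set -- $SSH_ORIGINAL_COMMAND
    case "$1" in
        crm_mon|*/crm_mon|*/crm_attribute|*/crm_master)
            exec sudo -n "$@"
            ;;
        *)
            echo "galera-remote-wrapper: refusing '$*'" >&2
            exit 1
            ;;
    esac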