Skip to content

Commit 12596c2

Browse files
committed
tests: ipsec: Check that nodes can ping each other in the NxN test.
Expand the NxN test with the network connectivity check between all the nodes. Unfortunately, we can't really run this test with Libreswan 4.x, since, due to internal issues in these versions, we are getting into states where everything is loaded and active, but no traffic can pass. This is an internal issue in Libreswan that we can't work around from the outside. So, the fix is required in Libreswan itself. 4.5 and earlier versions seem to not be affected by this problem, at least not severely affected, but it's easier to just cut off all the 4.x versions from the test. 3.32 version from Ubuntu 22.04 and Libreswan 5.1 work just fine with this test. Test is relatively long, but it is very valuable, IMO. Besides stressing ovs-monitor-ipsec with various failure and asynchronous connection establishment conditions, which are important for OVS, it also was used to reproduce and fix several bugs in Libreswan 4.x. Unfortunately, not all the issues are understood and fixed yet. Acked-by: Eelco Chaudron <[email protected]> Signed-off-by: Ilya Maximets <[email protected]>
1 parent 55bf106 commit 12596c2

File tree

1 file changed

+76
-8
lines changed

1 file changed

+76
-8
lines changed

tests/system-ipsec.at

Lines changed: 76 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,9 @@ m4_define([IPSEC_ADD_NODE],
7171
on_exit "kill `cat $ovs_base/$1/ovs-monitor-ipsec.pid`"
7272

7373
dnl Set up OVS bridge
74-
NS_EXEC([$1], [ovs-vsctl --db unix:$ovs_base/$1/db.sock add-br br-ipsec])]
74+
NS_CHECK_EXEC([$1],
75+
[ovs-vsctl --db unix:$ovs_base/$1/db.sock add-br br-ipsec \
76+
-- set-controller br-ipsec punix:$ovs_base/br-ipsec.$1.mgmt])]
7577
)
7678
m4_define([IPSEC_ADD_NODE_LEFT], [IPSEC_ADD_NODE(left, p0, $1, $2)])
7779
m4_define([IPSEC_ADD_NODE_RIGHT], [IPSEC_ADD_NODE(right, p1, $1, $2)])
@@ -429,7 +431,8 @@ m4_for([id], [1], NODES, [1], [
429431
self-sign node-id], [0], [stdout])
430432
AT_CHECK(OVS_VSCTL([node-id], set Open_vSwitch . \
431433
other_config:certificate=${ovs_base}/node-id-cert.pem \
432-
other_config:private_key=${ovs_base}/node-id-privkey.pem),
434+
other_config:private_key=${ovs_base}/node-id-privkey.pem \
435+
-- set bridge br-ipsec other-config:hwaddr=f2:ff:00:00:00:id),
433436
[0], [ignore], [ignore])
434437
on_exit "ipsec --rundir $ovs_base/node-id status > $ovs_base/node-id/status"
435438
])
@@ -445,11 +448,18 @@ m4_for([LEFT], [1], NODES, [1], [
445448
fi
446449
])])
447450

451+
dnl These are not necessary, but nice to have in the test log in
452+
dnl order to spot pluto failures during the test.
453+
on_exit "grep -E 'Timed out|outdated|half-loaded|defunct' \
454+
$ovs_base/node-*/ovs-monitor-ipsec.log"
455+
on_exit "grep -E 'ABORT|ERROR' $ovs_base/node-*/pluto.log"
456+
448457
m4_define([WAIT_FOR_LOADED_CONNS], [
449458
m4_for([id], [1], NODES, [1], [
450459
echo "================== node-id ========================="
451460
iterations=0
452461
loaded=0
462+
active=0
453463
dnl Using a custom loop instead of OVS_WAIT_UNTIL, because it may take
454464
dnl much longer than a default timeout. The default retransmit timeout
455465
dnl for pluto is 60 seconds. Also, we need to make sure pluto didn't
@@ -463,8 +473,11 @@ m4_define([WAIT_FOR_LOADED_CONNS], [
463473
START_PLUTO([node-id])
464474
else
465475
loaded=$(IPSEC_STATUS_LOADED(node-id))
476+
m4_if([$1], [active],
477+
[active=$(IPSEC_STATUS_ACTIVE(node-id))], [active=$loaded])
466478
fi
467-
if test "$loaded" -ne $(( (NODES - 1) * 2 )); then
479+
if test "$loaded" -ne "$(( (NODES - 1) * 2 ))" -o \
480+
"$loaded" -ne "$active"; then
468481
sleep 3
469482
else
470483
break
@@ -505,11 +518,66 @@ OVS_WAIT_UNTIL([grep -q 'tun-2.*need to reconcile' \
505518
dnl Wait for all the connections to be loaded back.
506519
WAIT_FOR_LOADED_CONNS()
507520

508-
dnl These are not necessary, but nice to have in the test log in
509-
dnl order to spot pluto failures during the test.
510-
grep -E 'Timed out|outdated|half-loaded|defunct' \
511-
$ovs_base/node-*/ovs-monitor-ipsec.log
512-
grep -E 'ABORT|ERROR' $ovs_base/node-*/pluto.log
521+
dnl Next section will check connectivity between all the nodes.
522+
dnl Different versions of Libreswan 4.x have issues where connections
523+
dnl are not being correctly established or never become active in a
524+
dnl way that can not be mitigated from ovs-monitor-ipsec or the test.
525+
dnl So, only checking connectivity for Libreswan 3- or 5+.
526+
dnl Skipping in the middle of the test, so test can still fail while
527+
dnl testing with Libreswan 4, if the first half fails.
528+
AT_SKIP_IF([ipsec --version 2>&1 | grep -q 'Libreswan 4\.'])
529+
530+
dnl Turn off IPv6 and add static ARP entries for all namespaces to avoid
531+
dnl any broadcast / multicast traffic that would otherwise be multiplied
532+
dnl by each node creating a traffic storm. Add specific OpenFlow rules
533+
dnl to forward traffic to exact destinations without any MAC learning.
534+
m4_for([LEFT], [1], NODES, [1], [
535+
NS_CHECK_EXEC([node-LEFT], [sysctl -w net.ipv6.conf.all.disable_ipv6=1],
536+
[0], [ignore])
537+
AT_CHECK([ovs-ofctl del-flows unix:$ovs_base/br-ipsec.node-LEFT.mgmt])
538+
AT_CHECK([ovs-ofctl add-flow unix:$ovs_base/br-ipsec.node-LEFT.mgmt \
539+
"dl_dst=f2:ff:00:00:00:LEFT actions=LOCAL"])
540+
m4_for([RIGHT], [1], NODES, [1], [
541+
if test LEFT -ne RIGHT; then
542+
NS_CHECK_EXEC([node-LEFT],
543+
[ip neigh add 192.0.0.RIGHT lladdr f2:ff:00:00:00:RIGHT dev br-ipsec])
544+
AT_CHECK([ovs-ofctl add-flow unix:$ovs_base/br-ipsec.node-LEFT.mgmt \
545+
"dl_dst=f2:ff:00:00:00:RIGHT actions=tun-RIGHT"])
546+
fi
547+
])
548+
])
549+
550+
dnl Bring up and add IP addresses for br-ipsec interface.
551+
m4_for([id], [1], NODES, [1], [
552+
echo "================== node-id ========================="
553+
NS_CHECK_EXEC([node-id], [ip addr add 192.0.0.id/24 dev br-ipsec])
554+
NS_CHECK_EXEC([node-id], [ip link set dev br-ipsec up])
555+
])
556+
557+
dnl Wait for all the connections to be loaded and active. In case one of
558+
dnl the pluto processes crashed some of the connections may never become
559+
dnl active. But we did run this loop with a pluto reviving logic twice
560+
dnl already, so the chances for pluto to be down here are much lower.
561+
WAIT_FOR_LOADED_CONNS([active])
562+
563+
dnl Check the full mesh ping.
564+
m4_for([LEFT], [1], NODES, [1], [
565+
m4_for([RIGHT], [1], NODES, [1], [
566+
if test LEFT -ne RIGHT; then
567+
echo "====== ping: node-LEFT --> node-RIGHT =========="
568+
dnl Ping without checking in case connection will recover after the
569+
dnl first packet.
570+
NS_CHECK_EXEC([node-LEFT],
571+
[ping -q -c 1 -W 2 192.0.0.RIGHT | FORMAT_PING],
572+
[ignore], [stdout])
573+
dnl Now check. If this one fails, there is no actual connectivity.
574+
NS_CHECK_EXEC([node-LEFT],
575+
[ping -q -c 3 -i 0.1 -W 2 192.0.0.RIGHT | FORMAT_PING],
576+
[0], [dnl
577+
3 packets transmitted, 3 received, 0% packet loss, time 0ms
578+
])
579+
fi
580+
])])
513581

514582
OVS_TRAFFIC_VSWITCHD_STOP()
515583
AT_CLEANUP

0 commit comments

Comments
 (0)