Skip to content

Commit 76a50ff

Browse files
committed
hmi: Add test case to trigger TOD topology switch.
This test triggers the TOD topology failover on all the chips to exercise the OPAL TI and panic paths and to make sure the OS does not get stuck while going down. This test needs the following skiboot and kernel commits to pass: skiboot: 497734984 opal/hmi: set a flag to inform OS that TOD/TB has failed. ca349b836 opal/hmi: Don't retry TOD recovery if it is already in failed state. 017da88b2 opal/hmi: Fix double unlock of hmi lock in failure path. kernel: http://patchwork.ozlabs.org/patch/1051379/ Signed-off-by: Mahesh Salgaonkar <[email protected]>
1 parent e4e3859 commit 76a50ff

File tree

3 files changed

+136
-0
lines changed

3 files changed

+136
-0
lines changed

common/OPexpect.py

+12
Original file line numberDiff line numberDiff line change
@@ -164,3 +164,15 @@ def expect(self, pattern, timeout=-1, searchwindowsize=-1, async=False):
164164
raise PlatformError(state, log)
165165

166166
return r - len(op_patterns)
167+
168+
def expect_no_fail(self, pattern, timeout=-1, searchwindowsize=-1, async_=False, **kw):
    '''
    Wait for ``pattern`` by delegating straight to the parent class's
    ``expect()``.

    This is useful for tests that do not consider a panic or an OPAL TI
    to be a test failure: it lets such testcases take control and look
    for a specific pattern in a system crash scenario.

    ``pattern``, ``timeout`` and ``searchwindowsize`` are forwarded
    unchanged to the parent ``expect()``, and its return value is handed
    back as-is.

    ``async_`` is accepted only to keep the signature parallel to
    ``expect()``; it is not forwarded. The legacy keyword spelling
    ``async`` is still accepted through ``**kw``, because ``async`` is a
    reserved word from Python 3.7 on and can no longer be used as a
    parameter name.

    Raises TypeError for any other unexpected keyword argument.
    '''
    # Backward compatibility: tolerate callers that still pass ``async=``.
    if 'async' in kw:
        async_ = kw.pop('async')
    if kw:
        raise TypeError("Unknown keyword arguments: {}".format(kw))

    return super(spawn, self).expect(pattern,
                                     timeout=timeout,
                                     searchwindowsize=searchwindowsize)

common/OpTestConstants.py

+1
Original file line numberDiff line numberDiff line change
@@ -266,6 +266,7 @@ class OpTestConstants():
266266
TFMR_DEC_PARITY_ERROR = "0006080000000000"
267267
TFMR_PURR_PARITY_ERROR = "0004080000000000"
268268
TFMR_SPURR_PARITY_ERROR = "0005080000000000"
269+
HMI_TOD_TOPOLOGY_FAILOVER = 7
269270

270271
# CPU sleep states constants
271272
GET_CPU_SLEEP_STATE2 = "cat /sys/devices/system/cpu/cpu*/cpuidle/state2/disable"

testcases/OpTestHMIHandling.py

+123
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,26 @@ def clear_stop(self):
106106
else:
107107
self.assertTrue(False, "OpTestHMIHandling failed to recover from previous OpSystemState.UNKNOWN_BAD")
108108

109+
def handle_panic(self):
    '''
    Wait for the kernel panic caused by an unrecoverable HMI, then for the
    system to start re-IPLing, and finally bring it back to the OS state.

    Fails the test if the panic message is not seen, or if the "ISTEP"
    marker does not show up on the console after the panic.
    '''
    console = self.cv_SYSTEM.console

    # Index 0 is the pattern we want; any other index means pexpect.TIMEOUT
    # or pexpect.EOF was hit first (assuming expect_no_fail returns the
    # index of the matched pattern, as pexpect's expect() does).
    rc = console.pty.expect_no_fail(["Kernel panic - not syncing: Unrecoverable HMI exception", pexpect.TIMEOUT, pexpect.EOF], timeout=120)
    if rc != 0:
        self.fail("OpTestHMIHandling: No panic after topology recovery failure")

    rc = console.pty.expect_no_fail(["ISTEP", pexpect.TIMEOUT, pexpect.EOF], timeout=120)
    if rc != 0:
        self.fail("OpTestHMIHandling: System failing to reboot after topology recovery failure")

    # The box is booting again: record that, then wait for the OS.
    self.cv_SYSTEM.set_state(OpSystemState.IPLing)
    self.cv_SYSTEM.goto_state(OpSystemState.OS)
120+
121+
def handle_OpalTI(self):
    '''
    Wait for the system to start re-IPLing after an OPAL TI and bring it
    back to the OS state.

    Fails the test if the "ISTEP" marker is not seen on the console.
    '''
    # Index 0 is "ISTEP"; any other index means pexpect.TIMEOUT or
    # pexpect.EOF was hit first (assuming expect_no_fail returns the index
    # of the matched pattern, as pexpect's expect() does).
    rc = self.cv_SYSTEM.console.pty.expect_no_fail(["ISTEP", pexpect.TIMEOUT, pexpect.EOF], timeout=120)
    if rc != 0:
        self.fail("System failed to reboot after OPAL TI")

    # The box is booting again: record that, then wait for the OS.
    self.cv_SYSTEM.set_state(OpSystemState.IPLing)
    self.cv_SYSTEM.goto_state(OpSystemState.OS)
128+
109129
def handle_ipl(self):
110130
rc = self.cv_SYSTEM.console.pty.expect(["ISTEP", "istep", pexpect.TIMEOUT, pexpect.EOF], timeout=180)
111131
log.debug("before={}".format(self.cv_SYSTEM.console.pty.before))
@@ -218,6 +238,54 @@ def form_scom_addr(self, addr, core):
218238
log.debug(val)
219239
return val
220240

241+
def is_node_present(self, node):
    '''
    Check whether the specified device-tree node is present.

    The system is first moved to the OS state, then ``ls <node>`` is run
    on the host console.

    Returns 1 if the command succeeds, 0 if it raises CommandFailed.
    '''
    self.cv_SYSTEM.goto_state(OpSystemState.OS)
    try:
        self.cv_HOST.host_run_command("ls %s" % node, console=1)
    except CommandFailed:
        # The command failed: treat the node as not present.
        return 0

    return 1
252+
253+
def get_OpalSwXstop(self):
    '''
    Query the current ``opal-sw-xstop`` setting from the ``ibm,skiboot``
    nvram partition.

    Side effect: stores the processor generation in ``self.proc_gen``.

    Returns the output of the nvram command when it succeeds. On a fresh
    system the option is not set and the command exits with code 255; in
    that case the platform default is returned instead:
      - POWER8 / POWER8E: "enable"
      - POWER9: "disable"

    Fails the test if nvram exits with any other code, or if the option is
    unset on a processor generation with no known default.
    '''
    self.proc_gen = self.cv_HOST.host_get_proc_gen(console=1)
    self.cv_SYSTEM.goto_state(OpSystemState.OS)
    try:
        return self.cv_HOST.host_run_command("nvram -p ibm,skiboot --print-config=opal-sw-xstop", console=1)
    except CommandFailed as cf:
        if cf.exitcode != 255:
            self.fail("get_OpalSwXstop() failed to query nvram.")

        # Exit code 255: the option has never been set, use the default.
        if self.proc_gen in ["POWER8", "POWER8E"]:
            return "enable"
        if self.proc_gen in ["POWER9"]:
            return "disable"

        # Previously this case fell through to returning an unbound
        # local; report it as an explicit failure instead.
        self.fail("get_OpalSwXstop(): opal-sw-xstop is not set and no default is known for %s" % self.proc_gen)
273+
274+
def set_OpalSwXstop(self, val):
    '''
    Make sure the ``opal-sw-xstop`` nvram option is set to ``val``.

    Nothing is written when the current setting (as reported by
    get_OpalSwXstop) already contains ``val``. Otherwise the option is
    updated through the ``nvram`` command on the host console and read
    back; the test fails if the read-back value does not contain ``val``.
    '''
    self.cv_SYSTEM.goto_state(OpSystemState.OS)
    if val in self.get_OpalSwXstop():
        return

    l_cmd = "nvram -p ibm,skiboot --update-config opal-sw-xstop=%s" % val
    self.cv_HOST.host_run_command(l_cmd, console=1)

    # Read the setting back to confirm the update actually took effect.
    if val not in self.get_OpalSwXstop():
        self.fail("Failed to set opal-sw-xstop config to %s" % val)
288+
221289
def clearGardEntries(self):
222290
self.cv_SYSTEM.goto_state(OpSystemState.OS)
223291
expect_prompt = self.cv_SYSTEM.util.build_prompt()
@@ -374,6 +442,8 @@ def _testHMIHandling(self, i_test):
374442
self._testTFMR_Errors(BMC_CONST.TFMR_DEC_PARITY_ERROR)
375443
self._testTFMR_Errors(BMC_CONST.TFMR_PURR_PARITY_ERROR)
376444
self._testTFMR_Errors(BMC_CONST.TFMR_SPURR_PARITY_ERROR)
445+
elif l_test == BMC_CONST.HMI_TOD_TOPOLOGY_FAILOVER:
446+
self._test_tod_topology_failover()
377447
else:
378448
raise Exception("Please provide valid test case")
379449
l_con.run_command("dmesg -C")
@@ -494,6 +564,43 @@ def _test_malfunction_alert(self):
494564
console.pty.sendline(l_cmd)
495565
self.handle_ipl()
496566

567+
def _test_tod_topology_failover(self):
568+
'''
569+
This function is used to test error path for hmi TOD topology failover.
570+
On HMI recovery failure TOD/TB goes in invalid state and stops running.
571+
In this case kernel should either
572+
a) panic followed by clean reboot. (For opal-sw-xstop=disable)
573+
OR
574+
b) cause OPAL TI by triggering sw checkstop to OCC. (For
575+
opal-sw-xstop=enable)
576+
577+
In both cases we should not see any hangs at Linux OS level.
578+
To simulate error condition inject TOD topology failover on all the
579+
chips until we see HMI failure.
580+
'''
581+
scom_addr = "0x40000"
582+
l_error = "0x4000000000000000"
583+
l_test_mode = "TI"
584+
585+
g = self.get_OpalSwXstop()
586+
if "disable" in g:
587+
l_test_mode="panic"
588+
589+
console = self.cv_SYSTEM.console
590+
l_cmd = ""
591+
for l_pair in self.l_dic:
592+
l_chip = l_pair[0]
593+
l_cmd_str = "PATH=/usr/local/sbin:$PATH putscom -c %s %s %s; " % (l_chip, scom_addr, l_error)
594+
l_cmd = l_cmd + l_cmd_str
595+
596+
console.pty.sendline(l_cmd)
597+
if l_test_mode == "panic":
598+
self.handle_panic()
599+
else:
600+
self.handle_OpalTI()
601+
602+
return
603+
497604
def _test_hyp_resource_err(self):
498605
'''
499606
This function is used to test HMI: Hypervisor resource error
@@ -650,6 +757,20 @@ def runTest(self):
650757
self._testHMIHandling(BMC_CONST.HMI_MALFUNCTION_ALERT)
651758
self.clearGardEntries()
652759

760+
class TodTopologyFailoverPanic(OpTestHMIHandling):
    '''
    TOD topology failover test with opal-sw-xstop forced to "disable".
    '''

    def runTest(self):
        # Select the "disable" setting before injecting the failover.
        self.set_OpalSwXstop("disable")
        failover_test = BMC_CONST.HMI_TOD_TOPOLOGY_FAILOVER
        self._testHMIHandling(failover_test)
764+
765+
class TodTopologyFailoverOpalTI(OpTestHMIHandling):
    '''
    TOD topology failover test with opal-sw-xstop forced to "enable".

    Skipped when the ibm,sw-checkstop-fir device-tree node is not present.
    '''

    def runTest(self):
        sw_xstop_node = "/proc/device-tree/ibm,sw-checkstop-fir"
        if self.is_node_present(sw_xstop_node) != 1:
            self.skipTest("OPAL TI not supported on this system.")
        else:
            self.set_OpalSwXstop("enable")
            self._testHMIHandling(BMC_CONST.HMI_TOD_TOPOLOGY_FAILOVER)
773+
653774
class HypervisorResourceError(OpTestHMIHandling):
654775
def runTest(self):
655776
self._testHMIHandling(BMC_CONST.HMI_HYPERVISOR_RESOURCE_ERROR)
@@ -663,6 +784,8 @@ def unrecoverable_suite():
663784
s = unittest.TestSuite()
664785
s.addTest(MalfunctionAlert())
665786
s.addTest(HypervisorResourceError())
787+
s.addTest(TodTopologyFailoverPanic())
788+
s.addTest(TodTopologyFailoverOpalTI())
666789
s.addTest(ClearGard())
667790
return s
668791

0 commit comments

Comments
 (0)