Skip to content

Commit c1edabb

Browse files
committed
CA-420533: Only clear RestartVM guidance on up-to-date hosts
During rolling pool upgrade (RPU), RestartVM guidance should only be cleared when a VM restarts on a host that has been updated to match the coordinator's software version. Previously, the guidance was cleared whenever a VM restarted, regardless of the host's update status. This commit ensures that RestartVM guidance persists until the VM restarts on an up-to-date host, this provides accurate feedback to administrators about which VMs still need restarting after RPU. Also adds unit tests covering 6 scenarios: * VM restart on updated vs old host (via xenopsd) * VM halt on updated vs old host (via force_state_reset) * Suspended VM resume on updated vs old host Signed-off-by: Gang Ji <[email protected]>
1 parent c57e943 commit c1edabb

File tree

3 files changed

+201
-4
lines changed

3 files changed

+201
-4
lines changed

ocaml/tests/test_xapi_xenops.ml

Lines changed: 186 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,84 @@ open Test_common
33
module D = Debug.Make (struct let name = "test_xapi_xenops" end)
44

55
open D
6+
module Date = Clock.Date
7+
8+
(** Helper to create a Xenops VM state for testing *)
9+
let make_xenops_state ~power_state ?(last_start_time = 0.0) () =
10+
let open Xenops_interface.Vm in
11+
{
12+
power_state
13+
; domids= [0]
14+
; consoles= []
15+
; memory_target= 0L
16+
; memory_actual= 0L
17+
; memory_limit= 0L
18+
; vcpu_target= 1
19+
; shadow_multiplier_target= 1.0
20+
; rtc_timeoffset= ""
21+
; uncooperative_balloon_driver= false
22+
; guest_agent= []
23+
; xsdata_state= []
24+
; pv_drivers_detected= false
25+
; last_start_time
26+
; hvm= false
27+
; nomigrate= false
28+
; nested_virt= false
29+
; domain_type= Domain_PV
30+
; featureset= ""
31+
}
32+
33+
(** Helper to set up VM for testing: sets pending guidances, resident host, and power state *)
34+
let setup_vm_for_test ~__context ~vm ~guidances ~resident_on ~power_state =
35+
Db.VM.set_pending_guidances ~__context ~self:vm ~value:guidances ;
36+
Db.VM.set_resident_on ~__context ~self:vm ~value:resident_on ;
37+
Db.VM.set_power_state ~__context ~self:vm ~value:power_state
38+
39+
(** Helper to check pending guidances after an operation *)
40+
let check_pending_guidances ~__context ~vm ~expect_restart_vm
41+
~expect_restart_device_model ~test_description =
42+
let remaining = Db.VM.get_pending_guidances ~__context ~self:vm in
43+
Alcotest.(check bool)
44+
(Printf.sprintf "restart_vm guidance %s - %s"
45+
(if expect_restart_vm then "present" else "cleared")
46+
test_description
47+
)
48+
expect_restart_vm
49+
(List.mem `restart_vm remaining) ;
50+
Alcotest.(check bool)
51+
(Printf.sprintf "restart_device_model guidance %s - %s"
52+
(if expect_restart_device_model then "present" else "cleared")
53+
test_description
54+
)
55+
expect_restart_device_model
56+
(List.mem `restart_device_model remaining)
57+
58+
(** Helper to simulate a VM state update via update_vm_internal *)
59+
let simulate_vm_state_update ~__context ~vm ~previous_power_state
60+
~new_power_state ~localhost =
61+
let previous_state = make_xenops_state ~power_state:previous_power_state () in
62+
let new_state =
63+
make_xenops_state ~power_state:new_power_state ~last_start_time:100.0 ()
64+
in
65+
let vm_uuid = Db.VM.get_uuid ~__context ~self:vm in
66+
let metrics = Db.VM.get_metrics ~__context ~self:vm in
67+
Db.VM_metrics.set_start_time ~__context ~self:metrics
68+
~value:(Date.of_unix_time 50.0) ;
69+
ignore
70+
(Xapi_xenops.update_vm_internal ~__context ~id:vm_uuid ~self:vm
71+
~previous:(Some previous_state) ~info:(Some new_state) ~localhost
72+
)
73+
74+
(** Helper to set host software version *)
75+
let set_host_software_version ~__context ~host ~platform_version ~xapi_version =
76+
Db.Host.remove_from_software_version ~__context ~self:host
77+
~key:Xapi_globs._platform_version ;
78+
Db.Host.add_to_software_version ~__context ~self:host
79+
~key:Xapi_globs._platform_version ~value:platform_version ;
80+
Db.Host.remove_from_software_version ~__context ~self:host
81+
~key:Xapi_globs._xapi_version ;
82+
Db.Host.add_to_software_version ~__context ~self:host
83+
~key:Xapi_globs._xapi_version ~value:xapi_version
684

785
let simulator_setup = ref false
886

@@ -187,4 +265,111 @@ let test_xapi_restart () =
187265
)
188266
unsetup_simulator
189267

190-
let test = [("test_xapi_restart", `Quick, test_xapi_restart)]
268+
(** Test that RestartVM guidance is only cleared when VM restarts on up-to-date host *)
269+
let test_pending_guidance_clearing () =
270+
let __context = make_test_database () in
271+
Context.set_test_rpc __context (Mock_rpc.rpc __context) ;
272+
273+
let localhost = Helpers.get_localhost ~__context in
274+
let host2 = make_host ~__context ~name_label:"host2" ~hostname:"host2" () in
275+
276+
(* Set up software versions - localhost is up-to-date, host2 is not *)
277+
set_host_software_version ~__context ~host:localhost ~platform_version:"1.2.3"
278+
~xapi_version:"4.5.6" ;
279+
set_host_software_version ~__context ~host:host2 ~platform_version:"1.2.2"
280+
~xapi_version:"4.5.5" ;
281+
282+
(* Set localhost as the pool coordinator *)
283+
let pool = Db.Pool.get_all ~__context |> List.hd in
284+
Db.Pool.set_master ~__context ~self:pool ~value:localhost ;
285+
286+
let vm = make_vm ~__context () in
287+
288+
(* Set up VM guidances - both restart_vm and restart_device_model *)
289+
let guidances = [`restart_vm; `restart_device_model] in
290+
291+
(* Test 1: VM running on up-to-date host - should clear restart_vm *)
292+
setup_vm_for_test ~__context ~vm ~guidances ~resident_on:localhost
293+
~power_state:`Halted ;
294+
simulate_vm_state_update ~__context ~vm
295+
~previous_power_state:Xenops_interface.Halted
296+
~new_power_state:Xenops_interface.Running ~localhost ;
297+
check_pending_guidances ~__context ~vm ~expect_restart_vm:false
298+
~expect_restart_device_model:false
299+
~test_description:"VM restarted on up-to-date host" ;
300+
301+
(* Test 2: VM running on old host - should NOT clear restart_vm *)
302+
setup_vm_for_test ~__context ~vm ~guidances ~resident_on:host2
303+
~power_state:`Halted ;
304+
simulate_vm_state_update ~__context ~vm
305+
~previous_power_state:Xenops_interface.Halted
306+
~new_power_state:Xenops_interface.Running ~localhost:host2 ;
307+
check_pending_guidances ~__context ~vm ~expect_restart_vm:true
308+
~expect_restart_device_model:false
309+
~test_description:"VM restarted on old host" ;
310+
311+
(* Test 3: VM halted on up-to-date host - should clear restart_vm *)
312+
setup_vm_for_test ~__context ~vm ~guidances ~resident_on:localhost
313+
~power_state:`Running ;
314+
Xapi_vm_lifecycle.force_state_reset_keep_current_operations ~__context
315+
~self:vm ~value:`Halted ;
316+
check_pending_guidances ~__context ~vm ~expect_restart_vm:false
317+
~expect_restart_device_model:false
318+
~test_description:"halted VM on up-to-date host" ;
319+
320+
(* Test 4: VM halted on old host - should NOT clear restart_vm *)
321+
setup_vm_for_test ~__context ~vm ~guidances ~resident_on:host2
322+
~power_state:`Running ;
323+
Xapi_vm_lifecycle.force_state_reset_keep_current_operations ~__context
324+
~self:vm ~value:`Halted ;
325+
check_pending_guidances ~__context ~vm ~expect_restart_vm:true
326+
~expect_restart_device_model:false ~test_description:"halted VM on old host"
327+
328+
let test_pending_guidance_suspended_vm () =
329+
let __context = make_test_database () in
330+
Context.set_test_rpc __context (Mock_rpc.rpc __context) ;
331+
332+
let localhost = Helpers.get_localhost ~__context in
333+
let host2 = make_host ~__context ~name_label:"host2" ~hostname:"host2" () in
334+
335+
(* Set up software versions - localhost is up-to-date, host2 is not *)
336+
set_host_software_version ~__context ~host:localhost ~platform_version:"1.2.3"
337+
~xapi_version:"4.5.6" ;
338+
set_host_software_version ~__context ~host:host2 ~platform_version:"1.2.2"
339+
~xapi_version:"4.5.5" ;
340+
341+
(* Set localhost as the pool coordinator *)
342+
let pool = Db.Pool.get_all ~__context |> List.hd in
343+
Db.Pool.set_master ~__context ~self:pool ~value:localhost ;
344+
345+
(* Test 1: Suspended VM resumed on up-to-date host - should NOT clear any guidance *)
346+
let vm = make_vm ~__context () in
347+
let guidances = [`restart_vm; `restart_device_model] in
348+
setup_vm_for_test ~__context ~vm ~guidances ~resident_on:localhost
349+
~power_state:`Suspended ;
350+
simulate_vm_state_update ~__context ~vm
351+
~previous_power_state:Xenops_interface.Suspended
352+
~new_power_state:Xenops_interface.Running ~localhost ;
353+
check_pending_guidances ~__context ~vm ~expect_restart_vm:true
354+
~expect_restart_device_model:true
355+
~test_description:"resumed suspended VM on up-to-date host" ;
356+
357+
(* Test 2: Suspended VM resumed on old host - should NOT clear any guidance *)
358+
setup_vm_for_test ~__context ~vm ~guidances ~resident_on:host2
359+
~power_state:`Suspended ;
360+
simulate_vm_state_update ~__context ~vm
361+
~previous_power_state:Xenops_interface.Suspended
362+
~new_power_state:Xenops_interface.Running ~localhost:host2 ;
363+
check_pending_guidances ~__context ~vm ~expect_restart_vm:true
364+
~expect_restart_device_model:true
365+
~test_description:"resumed suspended VM on old host"
366+
367+
let test =
368+
[
369+
("test_xapi_restart", `Quick, test_xapi_restart)
370+
; ("test_pending_guidance_clearing", `Quick, test_pending_guidance_clearing)
371+
; ( "test_pending_guidance_suspended_vm"
372+
, `Quick
373+
, test_pending_guidance_suspended_vm
374+
)
375+
]

ocaml/xapi/xapi_vm_lifecycle.ml

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -909,7 +909,14 @@ let force_state_reset_keep_current_operations ~__context ~self ~value:state =
909909
(* Blank the requires_reboot flag *)
910910
Db.VM.set_requires_reboot ~__context ~self ~value:false ;
911911
remove_pending_guidance ~__context ~self ~value:`restart_device_model ;
912-
remove_pending_guidance ~__context ~self ~value:`restart_vm
912+
(* Only remove RestartVM guidance if host is up-to-date with coordinator *)
913+
let resident_on = Db.VM.get_resident_on ~__context ~self in
914+
if
915+
Db.is_valid_ref __context resident_on
916+
&& Helpers.Checks.RPU.are_host_versions_same_on_master ~__context
917+
~host:resident_on
918+
then
919+
remove_pending_guidance ~__context ~self ~value:`restart_vm
913920
) ;
914921
(* Do not clear resident_on for VM and VGPU in a checkpoint operation *)
915922
if

ocaml/xapi/xapi_xenops.ml

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2350,8 +2350,13 @@ let update_vm_internal ~__context ~id ~self ~previous ~info ~localhost =
23502350
then (
23512351
Xapi_vm_lifecycle.remove_pending_guidance ~__context ~self
23522352
~value:`restart_device_model ;
2353-
Xapi_vm_lifecycle.remove_pending_guidance ~__context ~self
2354-
~value:`restart_vm
2353+
(* Only remove RestartVM guidance if host is up-to-date with coordinator *)
2354+
if
2355+
Helpers.Checks.RPU.are_host_versions_same_on_master ~__context
2356+
~host:localhost
2357+
then
2358+
Xapi_vm_lifecycle.remove_pending_guidance ~__context ~self
2359+
~value:`restart_vm
23552360
)
23562361
) ;
23572362
create_guest_metrics_if_needed () ;

0 commit comments

Comments
 (0)