Skip to content

Commit feae60c

Browse files
authored
[sled-agent] Replace tokio::spawn'd mg-ddm calls with a reconciler task (#7521)
Prior to this PR, sled-agent would spawn three infinite retry-until-success tasks on startup: 1. Sending mg-ddm enough information to turn on its Oximeter stats 2. Telling mg-ddm to advertise the sled's bootstrap prefix 3. Telling mg-ddm to advertise the sled's underlay prefix Additionally, it would spawn an infinite retry-until-success task for any internal DNS zone it started (telling mg-ddm to advertise the internal DNS prefix). This PR replaces all of these with a `DdmReconciler`, which is responsible for taking all of the above information and periodically syncing with the local mg-ddm instance to ensure it has up-to-date information. Fixes #7377. Builds on #7507.
1 parent f2d20c8 commit feae60c

File tree

7 files changed

+348
-209
lines changed

7 files changed

+348
-209
lines changed

clients/ddm-admin-client/src/lib.rs

+18-2
Original file line numberDiff line numberDiff line change
@@ -75,14 +75,30 @@ impl Client {
7575

7676
pub async fn advertise_prefixes(
7777
&self,
78-
prefixes: Vec<Ipv6Net>,
78+
prefixes: &Vec<Ipv6Net>,
7979
) -> Result<(), Error<types::Error>> {
8080
self.inner
81-
.advertise_prefixes(&prefixes)
81+
.advertise_prefixes(prefixes)
8282
.await
8383
.map(|resp| resp.into_inner())
8484
}
8585

86+
pub async fn withdraw_prefixes(
87+
&self,
88+
prefixes: &Vec<Ipv6Net>,
89+
) -> Result<(), Error<types::Error>> {
90+
self.inner
91+
.withdraw_prefixes(prefixes)
92+
.await
93+
.map(|resp| resp.into_inner())
94+
}
95+
96+
pub async fn get_originated(
97+
&self,
98+
) -> Result<Vec<Ipv6Net>, Error<types::Error>> {
99+
self.inner.get_originated().await.map(|resp| resp.into_inner())
100+
}
101+
86102
pub async fn enable_stats(
87103
&self,
88104
request: &EnableStatsRequest,

installinator/src/bootstrap.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@ pub(crate) async fn bootstrap_sled(
9292
"Sending prefix to ddmd for advertisement";
9393
"prefix" => ?prefix,
9494
);
95-
ddmd_client.advertise_prefixes(vec![prefix]).await?;
95+
ddmd_client.advertise_prefixes(&vec![prefix]).await?;
9696
Ok(())
9797
},
9898
|err, duration| {

sled-agent/src/bootstrap/pre_server.rs

+12-58
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ use super::pumpkind;
1515
use super::server::StartError;
1616
use crate::config::Config;
1717
use crate::config::SidecarRevision;
18+
use crate::ddm_reconciler::DdmReconciler;
1819
use crate::long_running_tasks::{
1920
LongRunningTaskHandles, spawn_all_longrunning_tasks,
2021
};
@@ -34,25 +35,19 @@ use illumos_utils::zone;
3435
use illumos_utils::zone::Zones;
3536
use omicron_common::FileKv;
3637
use omicron_common::address::Ipv6Subnet;
37-
use omicron_common::address::SLED_PREFIX;
38-
use omicron_common::backoff::retry_notify;
39-
use omicron_common::backoff::retry_policy_internal_service_aggressive;
40-
use omicron_ddm_admin_client::Client as DdmAdminClient;
4138
use sled_hardware::DendriteAsic;
4239
use sled_hardware::SledMode;
4340
use sled_hardware::underlay;
4441
use sled_hardware_types::underlay::BootstrapInterface;
4542
use slog::Drain;
4643
use slog::Logger;
47-
use slog_error_chain::InlineErrorChain;
4844
use std::net::IpAddr;
4945
use std::net::Ipv6Addr;
5046
use tokio::sync::oneshot;
5147

5248
pub(super) struct BootstrapAgentStartup {
5349
pub(super) config: Config,
5450
pub(super) global_zone_bootstrap_ip: Ipv6Addr,
55-
pub(super) ddm_admin_localhost_client: DdmAdminClient,
5651
pub(super) base_log: Logger,
5752
pub(super) startup_log: Logger,
5853
pub(super) service_manager: ServiceManager,
@@ -77,72 +72,32 @@ impl BootstrapAgentStartup {
7772

7873
// Perform several blocking startup tasks first; we move `config` and
7974
// `log` into this task, and on success, it gives them back to us.
80-
let (config, log, ddm_admin_localhost_client, startup_networking) =
75+
let (config, log, startup_networking) =
8176
tokio::task::spawn_blocking(move || {
8277
enable_mg_ddm(&config, &log)?;
8378
pumpkind::enable_pumpkind_service(&log)?;
8479
ensure_zfs_key_directory_exists(&log)?;
8580

8681
let startup_networking = BootstrapNetworking::setup(&config)?;
8782

88-
// Spawn a background task to notify our local ddmd of our
89-
// bootstrap address so it can advertise it to other sleds.
90-
//
91-
// TODO-cleanup Spawning a task here is a little suspect;
92-
// consider moving to a reconciler loop when we do omicron#7377.
93-
let ddmd_client = DdmAdminClient::localhost(&log)
94-
.map_err(StartError::CreateDdmAdminLocalhostClient)?;
95-
tokio::spawn({
96-
let prefix = Ipv6Subnet::<SLED_PREFIX>::new(
97-
startup_networking.global_zone_bootstrap_ip,
98-
)
99-
.net();
100-
let ddmd_client = ddmd_client.clone();
101-
async move {
102-
retry_notify(
103-
retry_policy_internal_service_aggressive(),
104-
|| async {
105-
info!(
106-
ddmd_client.log(),
107-
"Sending underlay prefix to \
108-
ddmd for advertisement";
109-
"prefix" => ?prefix,
110-
);
111-
ddmd_client
112-
.advertise_prefixes(vec![prefix])
113-
.await?;
114-
Ok(())
115-
},
116-
|err, duration| {
117-
info!(
118-
ddmd_client.log(),
119-
"Failed to notify ddmd \
120-
of our address (will retry)";
121-
"retry_after" => ?duration,
122-
InlineErrorChain::new(&err),
123-
);
124-
},
125-
)
126-
.await
127-
.expect("retry policy retries until success");
128-
}
129-
});
130-
13183
// Before we create the switch zone, we need to ensure that the
13284
// necessary ZFS and Zone resources are ready. All other zones
13385
// are created on U.2 drives.
13486
ensure_zfs_ramdisk_dataset()?;
13587

136-
Ok::<_, StartError>((
137-
config,
138-
log,
139-
ddmd_client,
140-
startup_networking,
141-
))
88+
Ok::<_, StartError>((config, log, startup_networking))
14289
})
14390
.await
14491
.unwrap()?;
14592

93+
// Start the DDM reconciler, giving it our bootstrap subnet to
94+
// advertise to other sleds.
95+
let ddm_reconciler = DdmReconciler::new(
96+
Ipv6Subnet::new(startup_networking.global_zone_bootstrap_ip),
97+
&base_log,
98+
)
99+
.map_err(StartError::CreateDdmAdminLocalhostClient)?;
100+
146101
// Before we start monitoring for hardware, ensure we're running from a
147102
// predictable state.
148103
//
@@ -184,7 +139,7 @@ impl BootstrapAgentStartup {
184139

185140
let service_manager = ServiceManager::new(
186141
&base_log,
187-
ddm_admin_localhost_client.clone(),
142+
ddm_reconciler,
188143
startup_networking,
189144
sled_mode,
190145
time_sync,
@@ -204,7 +159,6 @@ impl BootstrapAgentStartup {
204159
Ok(Self {
205160
config,
206161
global_zone_bootstrap_ip,
207-
ddm_admin_localhost_client,
208162
base_log,
209163
startup_log: log,
210164
service_manager,

sled-agent/src/bootstrap/server.rs

+9-83
Original file line numberDiff line numberDiff line change
@@ -34,11 +34,8 @@ use illumos_utils::zone;
3434
use illumos_utils::zone::Zones;
3535
use internal_dns_resolver::Resolver;
3636
use omicron_common::address::{AZ_PREFIX, Ipv6Subnet};
37-
use omicron_common::backoff::retry_notify;
38-
use omicron_common::backoff::retry_policy_internal_service_aggressive;
3937
use omicron_common::ledger;
4038
use omicron_common::ledger::Ledger;
41-
use omicron_ddm_admin_client::Client as DdmAdminClient;
4239
use omicron_ddm_admin_client::DdmError;
4340
use omicron_ddm_admin_client::types::EnableStatsRequest;
4441
use omicron_uuid_kinds::GenericUuid;
@@ -49,7 +46,6 @@ use sled_hardware::underlay;
4946
use sled_storage::dataset::CONFIG_DATASET;
5047
use sled_storage::manager::StorageHandle;
5148
use slog::Logger;
52-
use slog_error_chain::InlineErrorChain;
5349
use std::io;
5450
use std::net::SocketAddr;
5551
use std::net::SocketAddrV6;
@@ -180,7 +176,6 @@ impl Server {
180176
let BootstrapAgentStartup {
181177
config,
182178
global_zone_bootstrap_ip,
183-
ddm_admin_localhost_client,
184179
base_log,
185180
startup_log,
186181
service_manager,
@@ -251,7 +246,6 @@ impl Server {
251246
start_sled_agent_request,
252247
long_running_task_handles.clone(),
253248
service_manager.clone(),
254-
&ddm_admin_localhost_client,
255249
&base_log,
256250
&startup_log,
257251
)
@@ -282,7 +276,6 @@ impl Server {
282276
state,
283277
sled_init_rx,
284278
sled_reset_rx,
285-
ddm_admin_localhost_client,
286279
long_running_task_handles,
287280
service_manager,
288281
_sprockets_server_handle: sprockets_server_handle,
@@ -359,7 +352,6 @@ async fn start_sled_agent(
359352
request: StartSledAgentRequest,
360353
long_running_task_handles: LongRunningTaskHandles,
361354
service_manager: ServiceManager,
362-
ddmd_client: &DdmAdminClient,
363355
base_log: &Logger,
364356
log: &Logger,
365357
) -> Result<SledAgentServer, SledAgentServerStartError> {
@@ -398,88 +390,24 @@ async fn start_sled_agent(
398390
// Inform the storage service that the key manager is available
399391
long_running_task_handles.storage_manager.key_manager_ready().await;
400392

401-
// Spawn a background task to notify our local ddmd of our underlay address
402-
// so it can advertise it to other sleds.
403-
//
404-
// TODO-cleanup Spawning a task here is a little suspect; consider moving to
405-
// a reconciler loop when we do omicron#7377.
406-
//
407-
// TODO-security This ddmd_client is used to advertise both this
408-
// (underlay) address and our bootstrap address. Bootstrap addresses are
409-
// unauthenticated (connections made on them are auth'd via sprockets),
410-
// but underlay addresses should be exchanged via authenticated channels
411-
// between ddmd instances. It's TBD how that will work, but presumably
412-
// we'll need to do something different here for underlay vs bootstrap
413-
// addrs (either talk to a differently-configured ddmd, or include info
414-
// indicating which kind of address we're advertising).
415-
tokio::spawn({
416-
let prefix = request.body.subnet.net();
417-
let ddmd_client = ddmd_client.clone();
418-
async move {
419-
retry_notify(
420-
retry_policy_internal_service_aggressive(),
421-
|| async {
422-
info!(
423-
ddmd_client.log(),
424-
"Sending underlay prefix to ddmd for advertisement";
425-
"prefix" => ?prefix,
426-
);
427-
ddmd_client.advertise_prefixes(vec![prefix]).await?;
428-
Ok(())
429-
},
430-
|err, duration| {
431-
info!(
432-
ddmd_client.log(),
433-
"Failed to notify ddmd of our address (will retry)";
434-
"retry_after" => ?duration,
435-
InlineErrorChain::new(&err),
436-
);
437-
},
438-
)
439-
.await
440-
.expect("retry policy retries until success");
441-
}
442-
});
393+
// Inform our DDM reconciler of our underlay subnet and the information it
394+
// needs for maghemite to enable Oximeter stats.
395+
{
396+
let ddm_reconciler = service_manager.ddm_reconciler();
443397

444-
// Also spawn a task to give ddmd the config it needs to start its oximeter
445-
// stats producer.
446-
tokio::spawn({
447398
let az_prefix =
448399
Ipv6Subnet::<AZ_PREFIX>::new(request.body.subnet.net().addr());
449400
let addr = request.body.subnet.net().iter().nth(1).unwrap();
450401
let dns_servers = Resolver::servers_from_subnet(az_prefix);
451-
let request = EnableStatsRequest {
402+
403+
ddm_reconciler.set_underlay_subnet(request.body.subnet);
404+
ddm_reconciler.enable_stats(EnableStatsRequest {
452405
addr: addr.into(),
453406
dns_servers: dns_servers.iter().map(|x| x.to_string()).collect(),
454407
rack_id: request.body.rack_id,
455408
sled_id: request.body.id.into_untyped_uuid(),
456-
};
457-
let ddmd_client = ddmd_client.clone();
458-
459-
async move {
460-
retry_notify(
461-
retry_policy_internal_service_aggressive(),
462-
|| async {
463-
info!(
464-
ddmd_client.log(), "Enabling ddm stats";
465-
"request" => ?request,
466-
);
467-
ddmd_client.enable_stats(&request).await?;
468-
Ok(())
469-
},
470-
|err, duration| {
471-
info!(
472-
ddmd_client.log(),
473-
"Failed enable ddm stats (will retry)";
474-
"retry_after" => ?duration,
475-
InlineErrorChain::new(&err),
476-
);
477-
},
478-
)
479-
.await
480-
.unwrap();
481-
}
482-
});
409+
});
410+
}
483411

484412
// Server does not exist, initialize it.
485413
let server = SledAgentServer::start(
@@ -574,7 +502,6 @@ struct Inner {
574502
oneshot::Sender<Result<SledAgentResponse, String>>,
575503
)>,
576504
sled_reset_rx: mpsc::Receiver<oneshot::Sender<Result<(), BootstrapError>>>,
577-
ddm_admin_localhost_client: DdmAdminClient,
578505
long_running_task_handles: LongRunningTaskHandles,
579506
service_manager: ServiceManager,
580507
_sprockets_server_handle: JoinHandle<()>,
@@ -637,7 +564,6 @@ impl Inner {
637564
request,
638565
self.long_running_task_handles.clone(),
639566
self.service_manager.clone(),
640-
&self.ddm_admin_localhost_client,
641567
&self.base_log,
642568
&log,
643569
)

0 commit comments

Comments
 (0)