Skip to content

Commit 3f1265f

Browse files
fruch and Copilot committed
feature(aws): log fallback to ssm based access
since we are running into issues where SSH access fails but we get zero logs because of it (this has happened multiple times over the years: credentials problems or other cloud-init/boot issues). In this change we make sure ssm-agents are working on our instances, and fall back to SSM during log collection if we can't get SSH access. * added it to the region configuration to enable it * added it to the top of the cloud-init to unmask the agent, see: scylladb/scylla-machine-image@b8e494d * `SSMCommandRunner` which has a `run()` API like our SSH based remoters * `CommandLog` collection falls back to using `SSMCommandRunner` Ref: #11581 Update sdcm/utils/aws_region.py Co-authored-by: Copilot <[email protected]> Update sdcm/provision/aws/utils.py Co-authored-by: Copilot <[email protected]> Update sdcm/utils/aws_ssm_runner.py Co-authored-by: Copilot <[email protected]>
1 parent a6e665d commit 3f1265f

File tree

6 files changed

+581
-13
lines changed

6 files changed

+581
-13
lines changed

sdcm/logcollector.py

Lines changed: 47 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -217,18 +217,23 @@ class CommandLog(BaseLogEntity):
217217
def collect(self, node, local_dst, remote_dst=None, local_search_path=None) -> Optional[str]:
218218
if not node or not node.remoter or remote_dst is None:
219219
return None
220-
remote_logfile = LogCollector.collect_log_remotely(node=node,
221-
cmd=self.cmd,
222-
log_filename=os.path.join(remote_dst, self.name))
220+
remote_logfile, is_file_remote = LogCollector.collect_log_remotely(node=node,
221+
cmd=self.cmd,
222+
log_filename=os.path.join(remote_dst, self.name))
223223
if not remote_logfile:
224224
LOGGER.warning(
225225
"Nothing to collect. Command '%s' did not prepare log file on remote host '%s'", self.cmd, node.name)
226226
return None
227-
LogCollector.receive_log(node=node,
228-
remote_log_path=remote_logfile,
229-
local_dir=local_dst,
230-
timeout=self.collect_timeout)
231-
return os.path.join(local_dst, os.path.basename(remote_logfile))
227+
local_path = Path(local_dst) / Path(remote_logfile).name
228+
if is_file_remote:
229+
LogCollector.receive_log(node=node,
230+
remote_log_path=remote_logfile,
231+
local_dir=local_dst,
232+
timeout=self.collect_timeout)
233+
else:
234+
# copy locally
235+
shutil.copyfile(remote_logfile, str(local_path))
236+
return str(local_path)
232237

233238

234239
class FileLog(CommandLog):
@@ -633,13 +638,44 @@ def create_remote_storage_dir(self, node, path=''):
633638
return remote_dir
634639

635640
@staticmethod
636-
def collect_log_remotely(node, cmd: str, log_filename: str) -> Optional[str]:
641+
def collect_log_remotely(node, cmd: str, log_filename: str) -> Tuple[Optional[str], bool]:
637642
if not node.remoter:
638-
return None
643+
return None, False
644+
645+
is_file_remote = True
639646
collect_log_command = f"{cmd} > '{log_filename}' 2>&1"
640647
node.remoter.run(collect_log_command, ignore_status=True, verbose=True)
641648
result = node.remoter.run(f"test -f '{log_filename}'", ignore_status=True)
642-
return log_filename if result.ok else None
649+
ok = result.ok
650+
651+
# Check if node is AWS-based
652+
if not ok and hasattr(node, "instance") and node.instance and getattr(node.instance, "cloud", None) == "aws":
653+
LOGGER.debug("Node %s is AWS-based", node.name)
654+
# Use SSM to run the command and save it to a local file
655+
is_file_remote = False
656+
657+
try:
658+
from sdcm.utils.aws_ssm_runner import SSMCommandRunner
659+
660+
region = node.instance.region if hasattr(
661+
node.instance, 'region') else node.instance.placement['AvailabilityZone'][:-1]
662+
ssm_runner = SSMCommandRunner(region_name=region, instance_id=node.instance.instance_id)
663+
664+
ssm_result = ssm_runner.run_command_and_save_output(
665+
command=collect_log_command,
666+
local_output_file=log_filename,
667+
comment=f'Collect log {log_filename}',
668+
ignore_status=True
669+
)
670+
671+
ok = ssm_result.ok
672+
if not ok:
673+
LOGGER.error("SSM command failed for instance %s: %s", node.instance.instance_id, ssm_result.stderr)
674+
return None, is_file_remote
675+
except (ImportError, AttributeError, TypeError, ValueError, KeyError, IndexError) as e:
676+
LOGGER.error("Failed to run SSM command: %s", e)
677+
return None, is_file_remote
678+
return log_filename if ok else None, is_file_remote
643679

644680
@staticmethod
645681
def archive_log_remotely(node, log_filename: str, archive_name: Optional[str] = None) -> Optional[str]:

sdcm/provision/aws/configuration_script.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,10 @@
1111
#
1212
# Copyright (c) 2021 ScyllaDB
1313

14-
from sdcm.provision.aws.utils import network_config_ipv6_workaround_script
14+
from sdcm.provision.aws.utils import (
15+
network_config_ipv6_workaround_script,
16+
enable_ssm_agent_script,
17+
)
1518
from sdcm.provision.common.configuration_script import ConfigurationScriptBuilder
1619

1720

@@ -26,7 +29,8 @@ def _wait_before_running_script(self) -> str:
2629
return 'while ! systemctl status cloud-init.service | grep "active (exited)"; do sleep 1; done\n'
2730

2831
def _script_body(self) -> str:
29-
script = super()._script_body()
32+
script = enable_ssm_agent_script()
33+
script += super()._script_body()
3034
if self.aws_ipv6_workaround:
3135
script += network_config_ipv6_workaround_script()
3236
return script

sdcm/provision/aws/utils.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -292,6 +292,17 @@ def network_config_ipv6_workaround_script():
292292
""")
293293

294294

295+
def enable_ssm_agent_script():
296+
"""Our images come with it masked by default. For testing we want this for debugging purposes, especially when we can't have SSH connectivity."""
297+
return dedent(r"""
298+
if ! systemctl is-active --quiet amazon-ssm-agent; then
299+
systemctl unmask amazon-ssm-agent
300+
systemctl enable amazon-ssm-agent
301+
systemctl start amazon-ssm-agent
302+
fi
303+
""")
304+
305+
295306
def configure_set_preserve_hostname_script():
296307
return 'grep "preserve_hostname: true" /etc/cloud/cloud.cfg 1>/dev/null 2>&1 ' \
297308
'|| echo "preserve_hostname: true" >> /etc/cloud/cloud.cfg\n'

sdcm/utils/aws_region.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ class AwsRegion:
3737
SCT_KEY_PAIR_NAME = "scylla_test_id_ed25519" # TODO: change legacy name to sct-keypair-aws
3838
SCT_SSH_GROUP_NAME = 'SCT-ssh-sg'
3939
SCT_SUBNET_PER_AZ = 2 # how many subnets to configure in each region.
40+
SCT_NODES_ROLE_ARN = "qa-scylla-manager-backup-role"
4041

4142
def __init__(self, region_name):
4243
self.region_name = region_name
@@ -651,6 +652,34 @@ def get_vpc_peering_routes(self) -> list[str]:
651652
LOGGER.debug("Discovered %s VPC peering routes in %s", peering_routes, self.region_name)
652653
return peering_routes
653654

655+
@cached_property
656+
def ssm(self):
657+
return boto3.client("ssm", region_name=self.region_name)
658+
659+
@cached_property
660+
def sts(self):
661+
return boto3.client("sts", region_name=self.region_name)
662+
663+
def configure_ssm(self, role_name=SCT_NODES_ROLE_ARN):
664+
"""Ensure that SSM agent can work in the region by adding necessary IAM role and instance profile"""
665+
666+
# Build the setting value from the given IAM role name (the role must already exist in IAM).
667+
# Example: service-role/AWSSystemsManagerDefaultEC2InstanceManagementRole
668+
# Note: The 'service-role/' prefix is crucial when setting the value.
669+
iam_role_for_dhmc = f"service-role/{role_name}"
670+
671+
account_id = self.sts.get_caller_identity()["Account"]
672+
region = self.ssm.meta.region_name
673+
674+
setting_id = f"arn:aws:ssm:{region}:{account_id}:servicesetting/ssm/managed-instance/default-ec2-instance-management-role"
675+
676+
try:
677+
response = self.ssm.update_service_setting(SettingId=setting_id, SettingValue=iam_role_for_dhmc)
678+
LOGGER.info("Default Host Management Configuration updated successfully.")
679+
LOGGER.debug(response)
680+
except botocore.exceptions.ClientError as e:
681+
LOGGER.error("Error updating Default Host Management Configuration: %s", e)
682+
654683
def configure(self):
655684
LOGGER.info("Configuring '%s' region...", self.region_name)
656685
self.create_sct_vpc()
@@ -660,4 +689,5 @@ def configure(self):
660689
self.create_sct_security_group()
661690
self.create_sct_ssh_security_group()
662691
self.create_sct_key_pair()
692+
self.configure_ssm()
663693
LOGGER.info("Region configured successfully.")

0 commit comments

Comments
 (0)