
Commit 5aad56a

fruch and Copilot committed
feature(aws): log fallback to ssm based access
Since we occasionally run into issues where SSH access fails and we end up with zero logs because of it (this has happened multiple times over the years, due to credentials or other cloud-init/boot problems), this change makes sure the SSM agent is working on our instances and falls back to it during log collection when SSH access is unavailable.

* added SSM to the region configuration to enable it
* added unmasking of the agent at the top of the cloud-init script, see: scylladb/scylla-machine-image@b8e494d
* `SSMCommandRunner`, which exposes a `run()` API like our SSH-based remoters
* `CommandLog` collection falls back to using `SSMCommandRunner`

Ref: #11581

Update sdcm/utils/aws_region.py

Co-authored-by: Copilot <[email protected]>

Update sdcm/provision/aws/utils.py

Co-authored-by: Copilot <[email protected]>

Update sdcm/utils/aws_ssm_runner.py

Co-authored-by: Copilot <[email protected]>
1 parent 1513fba commit 5aad56a

File tree

6 files changed, +625 -26 lines changed

sdcm/logcollector.py

Lines changed: 91 additions & 24 deletions
@@ -28,7 +28,8 @@
 from dataclasses import dataclass
 
 import requests
-from invoke.exceptions import UnexpectedExit
+from invoke.exceptions import UnexpectedExit, Failure as InvokeFailure
+from botocore.exceptions import ClientError
 
 import sdcm.monitorstack.ui as monitoring_ui
 from sdcm.paths import SCYLLA_YAML_PATH, SCYLLA_PROPERTIES_PATH, SCYLLA_MANAGER_AGENT_YAML_PATH, \
@@ -65,6 +66,8 @@
 from sdcm.utils.gce_utils import gce_public_addresses, gce_private_addresses
 from sdcm.localhost import LocalHost
 from sdcm.cloud_api_client import ScyllaCloudAPIClient
+from sdcm.utils.aws_ssm_runner import SSMCommandRunner
+from sdcm.remote.libssh2_client.exceptions import Failure as Libssh2_Failure
 
 LOGGER = logging.getLogger(__name__)
 
@@ -97,6 +100,17 @@ def distro(self):
         LOGGER.info("Detected Linux distribution: %s", _distro.name)
         return _distro
 
+    @cached_property
+    def is_aws(self) -> bool:
+        return isinstance(self._instance, dict) and self._instance.get("InstanceId") is not None
+
+    @cached_property
+    def aws_ssm_runner(self) -> Optional[SSMCommandRunner]:
+        if self.is_aws:
+            region = self._instance.get("Placement").get("AvailabilityZone")[:-1]
+            return SSMCommandRunner(region_name=region, instance_id=self._instance.get("InstanceId"))
+        return None
+
     @retrying(n=30, sleep_time=15, allowed_exceptions=(UnexpectedExit, Libssh2_UnexpectedExit,))
     def install_package(self,
                         package_name: str,
@@ -217,18 +231,24 @@ class CommandLog(BaseLogEntity):
     def collect(self, node, local_dst, remote_dst=None, local_search_path=None) -> Optional[str]:
         if not node or not node.remoter or remote_dst is None:
             return None
-        remote_logfile = LogCollector.collect_log_remotely(node=node,
-                                                           cmd=self.cmd,
-                                                           log_filename=os.path.join(remote_dst, self.name))
+
+        remote_logfile, is_file_remote = LogCollector.collect_log_remotely(node=node,
+                                                                           cmd=self.cmd,
+                                                                           log_filename=os.path.join(remote_dst, self.name))
         if not remote_logfile:
             LOGGER.warning(
                 "Nothing to collect. Command '%s' did not prepare log file on remote host '%s'", self.cmd, node.name)
             return None
-        LogCollector.receive_log(node=node,
-                                 remote_log_path=remote_logfile,
-                                 local_dir=local_dst,
-                                 timeout=self.collect_timeout)
-        return os.path.join(local_dst, os.path.basename(remote_logfile))
+        local_path = Path(local_dst) / Path(remote_logfile).name
+        if is_file_remote:
+            LogCollector.receive_log(node=node,
+                                     remote_log_path=remote_logfile,
+                                     local_dir=local_dst,
+                                     timeout=self.collect_timeout)
+        else:
+            # copy locally
+            shutil.copyfile(remote_logfile, str(local_path))
+        return str(local_path)
 
 
 class FileLog(CommandLog):
@@ -617,29 +637,76 @@ def create_local_storage_dir(self, base_local_dir):
     def create_remote_storage_dir(self, node, path=''):
         if not path:
             path = node.name
-        try:
-            remote_dir = os.path.join(self.node_remote_dir, path)
-            result = node.remoter.run('mkdir -p {}'.format(remote_dir), ignore_status=True)
+        remote_dir = os.path.join(self.node_remote_dir, path)
 
-            if result.exited > 0:
-                LOGGER.error(
-                    'Remote storing folder not created.\n{}'.format(result))
-                remote_dir = self.node_remote_dir
+        if ssh_connected := node.remoter.is_up():
 
-        except Exception as details:  # noqa: BLE001
-            LOGGER.error("Error during creating remote directory %s", details)
+            try:
+                result = node.remoter.run('mkdir -p {}'.format(remote_dir), ignore_status=True)
+
+                if result.exited > 0:
+                    LOGGER.error(
+                        'Remote storing folder not created.\n{}'.format(result))
+                    remote_dir = self.node_remote_dir
+
+            except (Libssh2_Failure, InvokeFailure) as details:
+                LOGGER.error("Error during creating remote directory %s", details)
+        elif not ssh_connected and (ssm_runner := node.aws_ssm_runner):
+            try:
+                ssm_result = ssm_runner.run('mkdir -p {}'.format(remote_dir), ignore_status=True)
+                ok = ssm_result.ok
+                if not ok:
+                    LOGGER.error("SSM command failed for instance %s: mkdir", node._instance.get("InstanceId"))
+
+            except (ClientError, AttributeError) as e:
+                LOGGER.error("Failed to run SSM command: %s", e)
+                remote_dir = self.node_remote_dir
+        else:
             remote_dir = self.node_remote_dir
 
         return remote_dir
 
     @staticmethod
-    def collect_log_remotely(node, cmd: str, log_filename: str) -> Optional[str]:
+    def collect_log_remotely(node, cmd: str, log_filename: str) -> Tuple[Optional[str], bool]:
         if not node.remoter:
-            return None
-        collect_log_command = f"{cmd} > '{log_filename}' 2>&1"
-        node.remoter.run(collect_log_command, ignore_status=True, verbose=True)
-        result = node.remoter.run(f"test -f '{log_filename}'", ignore_status=True)
-        return log_filename if result.ok else None
+            return None, False
+
+        is_file_remote = True
+
+        if ssh_connected := node.remoter.is_up():
+            try:
+                collect_log_command = f"{cmd} > '{log_filename}' 2>&1"
+                node.remoter.run(collect_log_command, ignore_status=True, verbose=True)
+                result = node.remoter.run(f"test -f '{log_filename}'", ignore_status=True)
+                ok = result.ok
+            except (Libssh2_Failure, InvokeFailure):
+                ssh_connected = False
+
+        # Check if node is AWS-based
+        if not ssh_connected and (ssm_runner := node.aws_ssm_runner):
+            LOGGER.info("Collecting Node %s via SSM: %s", node.name, log_filename)
+            Path(log_filename).parent.mkdir(parents=True, exist_ok=True)
+
+            # Use SSM to run the command and save it to a local file
+            is_file_remote = False
+
+            try:
+                collect_log_command = f"{cmd}"
+                ssm_result = ssm_runner.run_command_and_save_output(
+                    command=collect_log_command,
+                    local_output_file=log_filename,
+                    comment=f'Collect log {log_filename}',
+                    ignore_status=True
+                )
+                ok = ssm_result.ok
+                if not ssm_result.ok:
+                    LOGGER.error("SSM command failed for instance %s: %s ",
                                  node._instance.get("InstanceId"), collect_log_command)
+                    return None, is_file_remote
+            except (ImportError, AttributeError, TypeError, ValueError, KeyError, IndexError) as e:
+                LOGGER.error("Failed to run SSM command: %s", e)
+                return None, is_file_remote
+        return log_filename if ok else None, is_file_remote
 
     @staticmethod
     def archive_log_remotely(node, log_filename: str, archive_name: Optional[str] = None) -> Optional[str]:
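The sixth changed file, sdcm/utils/aws_ssm_runner.py, is not expanded in this view. As a rough orientation only, a runner with the `run()` / `run_command_and_save_output()` surface used above could be sketched with plain boto3 as below; the result fields (`ok`, `stdout`, `stderr`) and the polling logic are assumptions, not the actual module contents.

# Hypothetical sketch of an SSM-based command runner -- NOT the actual
# contents of sdcm/utils/aws_ssm_runner.py, which is not shown in this diff.
import logging
import time
from dataclasses import dataclass

import boto3

LOGGER = logging.getLogger(__name__)


@dataclass
class SSMResult:
    ok: bool
    stdout: str = ""
    stderr: str = ""


class SSMCommandRunner:
    def __init__(self, region_name: str, instance_id: str):
        self.instance_id = instance_id
        self.ssm = boto3.client("ssm", region_name=region_name)

    def run(self, cmd: str, ignore_status: bool = False, timeout: int = 300) -> SSMResult:
        # Send the shell command through the SSM agent instead of SSH.
        response = self.ssm.send_command(
            InstanceIds=[self.instance_id],
            DocumentName="AWS-RunShellScript",
            Parameters={"commands": [cmd]},
        )
        command_id = response["Command"]["CommandId"]

        # send_command is asynchronous, so poll the invocation until it settles.
        # (A production version would also handle ssm.exceptions.InvocationDoesNotExist,
        # which can be raised when polling too soon after send_command.)
        invocation = {"Status": "Pending"}
        deadline = time.time() + timeout
        while time.time() < deadline and invocation["Status"] in ("Pending", "InProgress", "Delayed"):
            time.sleep(5)
            invocation = self.ssm.get_command_invocation(
                CommandId=command_id, InstanceId=self.instance_id)

        result = SSMResult(
            ok=invocation.get("Status") == "Success",
            stdout=invocation.get("StandardOutputContent", ""),
            stderr=invocation.get("StandardErrorContent", ""),
        )
        if not result.ok and not ignore_status:
            raise RuntimeError(f"SSM command failed: {cmd}\n{result.stderr}")
        return result

    def run_command_and_save_output(self, command: str, local_output_file: str,
                                    comment: str = "", ignore_status: bool = False) -> SSMResult:
        # Run remotely, then persist stdout locally -- which is why CommandLog.collect()
        # treats the resulting log file as local (is_file_remote=False).
        # 'comment' mirrors send_command's Comment argument; unused in this sketch.
        result = self.run(command, ignore_status=ignore_status)
        with open(local_output_file, "w", encoding="utf-8") as output_file:
            output_file.write(result.stdout)
        return result

In the LogCollector changes above, both entry points are only reached after node.remoter.is_up() has reported the SSH path as unavailable.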

sdcm/provision/aws/configuration_script.py

Lines changed: 6 additions & 2 deletions
@@ -11,7 +11,10 @@
 #
 # Copyright (c) 2021 ScyllaDB
 
-from sdcm.provision.aws.utils import network_config_ipv6_workaround_script
+from sdcm.provision.aws.utils import (
+    network_config_ipv6_workaround_script,
+    enable_ssm_agent_script,
+)
 from sdcm.provision.common.configuration_script import ConfigurationScriptBuilder
 
 
@@ -26,7 +29,8 @@ def _wait_before_running_script(self) -> str:
         return 'while ! systemctl status cloud-init.service | grep "active (exited)"; do sleep 1; done\n'
 
     def _script_body(self) -> str:
-        script = super()._script_body()
+        script = enable_ssm_agent_script()
+        script += super()._script_body()
         if self.aws_ipv6_workaround:
             script += network_config_ipv6_workaround_script()
         return script
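The effect of this reordering is that the agent is unmasked at the very top of the generated user-data, before any later step that could hang the boot. A tiny illustration of the composed body, assuming only the two helpers imported above (the middle line stands in for whatever the parent builder emits):

# Illustrative only: the SSM-enable snippet leads the user-data body.
from sdcm.provision.aws.utils import enable_ssm_agent_script, network_config_ipv6_workaround_script

body = enable_ssm_agent_script()                         # unmask/enable/start amazon-ssm-agent first
body += "# ... output of super()._script_body() ...\n"   # placeholder for the parent builder's script
body += network_config_ipv6_workaround_script()          # appended only when aws_ipv6_workaround is set

assert "systemctl unmask amazon-ssm-agent" in body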

sdcm/provision/aws/utils.py

Lines changed: 11 additions & 0 deletions
@@ -292,6 +292,17 @@ def network_config_ipv6_workaround_script():
     """)
 
 
+def enable_ssm_agent_script():
+    """Our images come with it masked by default. For testing we want this for debugging purposes, especially when we can't have SSH connectivity."""
+    return dedent(r"""
+        if ! systemctl is-active --quiet amazon-ssm-agent; then
+            systemctl unmask amazon-ssm-agent
+            systemctl enable amazon-ssm-agent
+            systemctl start amazon-ssm-agent
+        fi
+    """)
+
+
 def configure_set_preserve_hostname_script():
     return 'grep "preserve_hostname: true" /etc/cloud/cloud.cfg 1>/dev/null 2>&1 ' \
            '|| echo "preserve_hostname: true" >> /etc/cloud/cloud.cfg\n'

sdcm/utils/aws_region.py

Lines changed: 30 additions & 0 deletions
@@ -37,6 +37,7 @@ class AwsRegion:
     SCT_KEY_PAIR_NAME = "scylla_test_id_ed25519"  # TODO: change legacy name to sct-keypair-aws
     SCT_SSH_GROUP_NAME = 'SCT-ssh-sg'
     SCT_SUBNET_PER_AZ = 2  # how many subnets to configure in each region.
+    SCT_NODES_ROLE_ARN = "qa-scylla-manager-backup-role"
 
     def __init__(self, region_name):
         self.region_name = region_name
@@ -651,6 +652,34 @@ def get_vpc_peering_routes(self) -> list[str]:
         LOGGER.debug("Discovered %s VPC peering routes in %s", peering_routes, self.region_name)
         return peering_routes
 
+    @cached_property
+    def ssm(self):
+        return boto3.client("ssm", region_name=self.region_name)
+
+    @cached_property
+    def sts(self):
+        return boto3.client("sts", region_name=self.region_name)
+
+    def configure_ssm(self, role_name=SCT_NODES_ROLE_ARN):
+        """Ensure that SSM agent can work in the region by adding necessary IAM role and instance profile"""
+
+        # Replace with the actual ARN of your IAM role created in step 1
+        # Example: service-role/AWSSystemsManagerDefaultEC2InstanceManagementRole
+        # Note: The 'service-role/' prefix is crucial when setting the value.
+        iam_role_for_dhmc = f"service-role/{role_name}"
+
+        account_id = self.sts.get_caller_identity()["Account"]
+        region = self.ssm.meta.region_name
+
+        setting_id = f"arn:aws:ssm:{region}:{account_id}:servicesetting/ssm/managed-instance/default-ec2-instance-management-role"
+
+        try:
+            response = self.ssm.update_service_setting(SettingId=setting_id, SettingValue=iam_role_for_dhmc)
+            LOGGER.info("Default Host Management Configuration updated successfully.")
+            LOGGER.debug(response)
+        except botocore.exceptions.ClientError as e:
+            LOGGER.error(f"Error updating Default Host Management Configuration: {e}")
+
     def configure(self):
         LOGGER.info("Configuring '%s' region...", self.region_name)
         self.create_sct_vpc()
@@ -660,4 +689,5 @@ def configure(self):
         self.create_sct_security_group()
         self.create_sct_ssh_security_group()
         self.create_sct_key_pair()
+        self.configure_ssm()
         LOGGER.info("Region configured successfully.")
