Commit 82d900e

fruch and Copilot authored and committed
feature(aws): log fallback to ssm based access
Since we occasionally run into issues where SSH access fails and we end up with zero logs explaining why (it has happened multiple times over the years: credential problems, cloud-init/boot issues, and the like), this change makes sure the SSM agent is working on our instances and falls back to SSM during log collection when SSH access is unavailable.

* added SSM setup to the region configuration to enable it
* added a snippet at the top of the cloud-init script to unmask the agent, see: scylladb/scylla-machine-image@b8e494d
* added `SSMCommandRunner`, which exposes a `run()` API like our SSH-based remoters
* `CommandLog` collection now falls back to `SSMCommandRunner`

Ref: #11581

Update sdcm/utils/aws_region.py

Co-authored-by: Copilot <[email protected]>

Update sdcm/provision/aws/utils.py

Co-authored-by: Copilot <[email protected]>

Update sdcm/utils/aws_ssm_runner.py

Co-authored-by: Copilot <[email protected]>

(cherry picked from commit 5aad56a)
1 parent 4e955c8 commit 82d900e
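
The diff for the new `sdcm/utils/aws_ssm_runner.py` is not captured on this page. As a rough sketch only — assuming boto3's `send_command`/`get_command_invocation` SSM APIs and a result shape mirroring our SSH remoters (the `Result` class and the polling details below are assumptions, not the committed code) — the `run()` API described above could look like:

```python
# Hypothetical sketch of an SSM-based command runner; the committed
# sdcm/utils/aws_ssm_runner.py may differ in names and details.
import time
from dataclasses import dataclass

import boto3


@dataclass
class Result:  # assumed shape, mirroring our remoters' results
    exited: int
    stdout: str
    stderr: str

    @property
    def ok(self) -> bool:
        return self.exited == 0


class SSMCommandRunner:
    def __init__(self, region_name: str, instance_id: str):
        self.instance_id = instance_id
        self.ssm = boto3.client("ssm", region_name=region_name)

    def run(self, cmd: str, ignore_status: bool = False, timeout: int = 300) -> Result:
        # Send the shell command via the standard AWS-RunShellScript document.
        response = self.ssm.send_command(
            InstanceIds=[self.instance_id],
            DocumentName="AWS-RunShellScript",
            Parameters={"commands": [cmd]},
        )
        command_id = response["Command"]["CommandId"]
        invocation = {}
        deadline = time.time() + timeout
        while time.time() < deadline:
            time.sleep(2)  # the invocation may not exist immediately after send_command
            invocation = self.ssm.get_command_invocation(
                CommandId=command_id, InstanceId=self.instance_id)
            if invocation["Status"] not in ("Pending", "InProgress", "Delayed"):
                break
        result = Result(exited=invocation.get("ResponseCode", -1),
                        stdout=invocation.get("StandardOutputContent", ""),
                        stderr=invocation.get("StandardErrorContent", ""))
        if not result.ok and not ignore_status:
            raise RuntimeError(f"SSM command failed with rc={result.exited}: {cmd}")
        return result
```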

File tree

6 files changed: +665 -26 lines changed

sdcm/logcollector.py

Lines changed: 99 additions & 24 deletions

@@ -27,7 +27,8 @@
 from functools import cached_property

 import requests
-from invoke.exceptions import UnexpectedExit
+from invoke.exceptions import UnexpectedExit, Failure as InvokeFailure
+from botocore.exceptions import ClientError

 import sdcm.monitorstack.ui as monitoring_ui
 from sdcm.paths import SCYLLA_YAML_PATH, SCYLLA_PROPERTIES_PATH, SCYLLA_MANAGER_AGENT_YAML_PATH, \
@@ -62,6 +63,8 @@
 from sdcm.utils.k8s import KubernetesOps
 from sdcm.utils.s3_remote_uploader import upload_remote_files_directly_to_s3
 from sdcm.utils.gce_utils import gce_public_addresses, gce_private_addresses
+from sdcm.utils.aws_ssm_runner import SSMCommandRunner
+from sdcm.remote.libssh2_client.exceptions import Failure as Libssh2_Failure

 LOGGER = logging.getLogger(__name__)

@@ -94,6 +97,17 @@ def distro(self):
         LOGGER.info("Detected Linux distribution: %s", _distro.name)
         return _distro

+    @cached_property
+    def is_aws(self) -> bool:
+        return isinstance(self._instance, dict) and self._instance.get("InstanceId") is not None
+
+    @cached_property
+    def aws_ssm_runner(self) -> Optional[SSMCommandRunner]:
+        if self.is_aws:
+            region = self._instance.get("Placement").get("AvailabilityZone")[:-1]
+            return SSMCommandRunner(region_name=region, instance_id=self._instance.get("InstanceId"))
+        return None
+
     @retrying(n=30, sleep_time=15, allowed_exceptions=(UnexpectedExit, Libssh2_UnexpectedExit,))
     def install_package(self,
                         package_name: str,
@@ -214,18 +228,24 @@ class CommandLog(BaseLogEntity):
     def collect(self, node, local_dst, remote_dst=None, local_search_path=None) -> Optional[str]:
         if not node or not node.remoter or remote_dst is None:
             return None
-        remote_logfile = LogCollector.collect_log_remotely(node=node,
-                                                           cmd=self.cmd,
-                                                           log_filename=os.path.join(remote_dst, self.name))
+
+        remote_logfile, is_file_remote = LogCollector.collect_log_remotely(node=node,
+                                                                           cmd=self.cmd,
+                                                                           log_filename=os.path.join(remote_dst, self.name))
         if not remote_logfile:
             LOGGER.warning(
                 "Nothing to collect. Command '%s' did not prepare log file on remote host '%s'", self.cmd, node.name)
             return None
-        LogCollector.receive_log(node=node,
-                                 remote_log_path=remote_logfile,
-                                 local_dir=local_dst,
-                                 timeout=self.collect_timeout)
-        return os.path.join(local_dst, os.path.basename(remote_logfile))
+        local_path = Path(local_dst) / Path(remote_logfile).name
+        if is_file_remote:
+            LogCollector.receive_log(node=node,
+                                     remote_log_path=remote_logfile,
+                                     local_dir=local_dst,
+                                     timeout=self.collect_timeout)
+        else:
+            # copy locally
+            shutil.copyfile(remote_logfile, str(local_path))
+        return str(local_path)


 class FileLog(CommandLog):
@@ -614,29 +634,77 @@ def create_local_storage_dir(self, base_local_dir):
     def create_remote_storage_dir(self, node, path=''):
         if not path:
             path = node.name
-        try:
-            remote_dir = os.path.join(self.node_remote_dir, path)
-            result = node.remoter.run('mkdir -p {}'.format(remote_dir), ignore_status=True)
+        remote_dir = os.path.join(self.node_remote_dir, path)

-            if result.exited > 0:
-                LOGGER.error(
-                    'Remote storing folder not created.\n{}'.format(result))
-                remote_dir = self.node_remote_dir
+        if ssh_connected := node.remoter.is_up():

-        except Exception as details:  # noqa: BLE001
-            LOGGER.error("Error during creating remote directory %s", details)
+            try:
+                result = node.remoter.run('mkdir -p {}'.format(remote_dir), ignore_status=True)
+
+                if result.exited > 0:
+                    LOGGER.error(
+                        'Remote storing folder not created.\n{}'.format(result))
+                    remote_dir = self.node_remote_dir
+
+            except (Libssh2_Failure, InvokeFailure) as details:
+                LOGGER.error("Error during creating remote directory %s", details)
+        elif not ssh_connected and (ssm_runner := node.aws_ssm_runner):
+            try:
+                ssm_result = ssm_runner.run('mkdir -p {}'.format(remote_dir), ignore_status=True)
+                ok = ssm_result.ok
+                if not ok:
+                    LOGGER.error("SSM command failed for instance %s: mkdir", node._instance.get("InstanceId"))
+
+            except (ClientError, AttributeError) as e:
+                LOGGER.error("Failed to run SSM command: %s", e)
+                remote_dir = self.node_remote_dir
+        else:
             remote_dir = self.node_remote_dir

         return remote_dir

     @staticmethod
-    def collect_log_remotely(node, cmd: str, log_filename: str) -> Optional[str]:
+    def collect_log_remotely(node, cmd: str, log_filename: str) -> Tuple[Optional[str], bool]:
         if not node.remoter:
-            return None
-        collect_log_command = f"{cmd} > '{log_filename}' 2>&1"
-        node.remoter.run(collect_log_command, ignore_status=True, verbose=True)
-        result = node.remoter.run(f"test -f '{log_filename}'", ignore_status=True)
-        return log_filename if result.ok else None
+            return None, False
+
+        is_file_remote = True
+        ok = False  # stays False if neither SSH nor SSM can reach the node
+
+        if ssh_connected := node.remoter.is_up():
+            try:
+                collect_log_command = f"{cmd} > '{log_filename}' 2>&1"
+                node.remoter.run(collect_log_command, ignore_status=True, verbose=True)
+                result = node.remoter.run(f"test -f '{log_filename}'", ignore_status=True)
+                ok = result.ok
+            except (Libssh2_Failure, InvokeFailure):
+                ssh_connected = False
+
+        # Check if node is AWS-based
+        if not ssh_connected and (ssm_runner := node.aws_ssm_runner):
+            LOGGER.info("Collecting Node %s via SSM: %s", node.name, log_filename)
+            Path(log_filename).parent.mkdir(parents=True, exist_ok=True)
+
+            # Use SSM to run the command and save its output to a local file
+            is_file_remote = False
+
+            try:
+                collect_log_command = f"{cmd}"
+                ssm_result = ssm_runner.run_command_and_save_output(
+                    command=collect_log_command,
+                    local_output_file=log_filename,
+                    comment=f'Collect log {log_filename}',
+                    ignore_status=True
+                )
+                ok = ssm_result.ok
+                if not ssm_result.ok:
+                    LOGGER.error("SSM command failed for instance %s: %s",
+                                 node._instance.get("InstanceId"), collect_log_command)
+                    return None, is_file_remote
+            except (ImportError, AttributeError, TypeError, ValueError, KeyError, IndexError) as e:
+                LOGGER.error("Failed to run SSM command: %s", e)
+                return None, is_file_remote
+        return log_filename if ok else None, is_file_remote

     @staticmethod
     def archive_log_remotely(node, log_filename: str, archive_name: Optional[str] = None) -> Optional[str]:
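
`collect_log_remotely` above calls `ssm_runner.run_command_and_save_output(...)`, which lives in `sdcm/utils/aws_ssm_runner.py` and is not part of the diff captured here. A minimal sketch, assuming the `run()` shape sketched under the commit message (the body below is an assumption, not the committed code):

```python
# Hypothetical sketch; assumes SSMCommandRunner.run() returns an object
# with .ok and .stdout, as in the sketch above.
def run_command_and_save_output(self, command: str, local_output_file: str,
                                comment: str = "", ignore_status: bool = False):
    # The comment would presumably be forwarded to send_command's Comment
    # parameter; here it is accepted for API compatibility only.
    result = self.run(command, ignore_status=ignore_status)
    if result.ok:
        # The command runs on the instance, but its output is written to a
        # file local to the collector -- hence is_file_remote=False above.
        with open(local_output_file, "w", encoding="utf-8") as out:
            out.write(result.stdout)
    return result
```

One real-world caveat: `get_command_invocation` truncates `StandardOutputContent` at 24,000 characters, so a production implementation would likely need S3 output (`OutputS3BucketName` on `send_command`) for large logs.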

sdcm/provision/aws/configuration_script.py

Lines changed: 6 additions & 2 deletions

@@ -11,7 +11,10 @@
 #
 # Copyright (c) 2021 ScyllaDB

-from sdcm.provision.aws.utils import network_config_ipv6_workaround_script
+from sdcm.provision.aws.utils import (
+    network_config_ipv6_workaround_script,
+    enable_ssm_agent_script,
+)
 from sdcm.provision.common.configuration_script import ConfigurationScriptBuilder


@@ -26,7 +29,8 @@ def _wait_before_running_script(self) -> str:
         return 'while ! systemctl status cloud-init.service | grep "active (exited)"; do sleep 1; done\n'

     def _script_body(self) -> str:
-        script = super()._script_body()
+        script = enable_ssm_agent_script()
+        script += super()._script_body()
         if self.aws_ipv6_workaround:
             script += network_config_ipv6_workaround_script()
         return script
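
The net effect of `_script_body()` is that the SSM-agent enablement is emitted first, so the agent comes up even if a later configuration step (or SSH itself) breaks. A minimal illustration of the composition order, using only the imports shown in the diff (`script_body` below is a free-standing stand-in for the builder method, not SCT code):

```python
# Stand-in for the builder's _script_body(); mirrors the ordering in the
# diff above.
from sdcm.provision.aws.utils import (
    network_config_ipv6_workaround_script,
    enable_ssm_agent_script,
)

def script_body(common_body: str, aws_ipv6_workaround: bool) -> str:
    # SSM enablement goes first: it must run even if a later step fails,
    # since SSM may be the only remaining way to debug the instance.
    script = enable_ssm_agent_script()
    script += common_body  # super()._script_body() in the real builder
    if aws_ipv6_workaround:
        script += network_config_ipv6_workaround_script()
    return script
```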

sdcm/provision/aws/utils.py

Lines changed: 11 additions & 0 deletions

@@ -292,6 +292,17 @@ def network_config_ipv6_workaround_script():
     """)


+def enable_ssm_agent_script():
+    """Our images ship with the SSM agent masked by default; for testing we want it enabled for debugging purposes, especially when we can't get SSH connectivity."""
+    return dedent(r"""
+        if ! systemctl is-active --quiet amazon-ssm-agent; then
+            systemctl unmask amazon-ssm-agent
+            systemctl enable amazon-ssm-agent
+            systemctl start amazon-ssm-agent
+        fi
+    """)
+
+
 def configure_set_preserve_hostname_script():
     return 'grep "preserve_hostname: true" /etc/cloud/cloud.cfg 1>/dev/null 2>&1 ' \
            '|| echo "preserve_hostname: true" >> /etc/cloud/cloud.cfg\n'

sdcm/utils/aws_region.py

Lines changed: 62 additions & 0 deletions

@@ -37,6 +37,7 @@ class AwsRegion:
     SCT_KEY_PAIR_NAME = "scylla_test_id_ed25519"  # TODO: change legacy name to sct-keypair-aws
     SCT_SSH_GROUP_NAME = 'SCT-ssh-sg'
     SCT_SUBNET_PER_AZ = 2  # how many subnets to configure in each region.
+    SCT_NODES_ROLE_ARN = "qa-scylla-manager-backup-role"

     def __init__(self, region_name):
         self.region_name = region_name
@@ -615,6 +616,33 @@ def create_sct_key_pair(self):
                                        PublicKeyMaterial=sct_key_pair.public_key)
         LOGGER.info("SCT Key Pair created.")

+    @cached_property
+    def ssm(self):
+        return boto3.client("ssm", region_name=self.region_name)
+
+    @cached_property
+    def sts(self):
+        return boto3.client("sts", region_name=self.region_name)
+
+    def configure_ssm(self, role_name=SCT_NODES_ROLE_ARN):
+        """Ensure the SSM agent can work in the region by setting the default EC2 instance management role (DHMC)."""
+
+        # The role must be passed with the 'service-role/' prefix,
+        # e.g. service-role/AWSSystemsManagerDefaultEC2InstanceManagementRole.
+        iam_role_for_dhmc = f"service-role/{role_name}"
+
+        account_id = self.sts.get_caller_identity()["Account"]
+        region = self.ssm.meta.region_name
+
+        setting_id = f"arn:aws:ssm:{region}:{account_id}:servicesetting/ssm/managed-instance/default-ec2-instance-management-role"
+
+        try:
+            response = self.ssm.update_service_setting(SettingId=setting_id, SettingValue=iam_role_for_dhmc)
+            LOGGER.info("Default Host Management Configuration updated successfully.")
+            LOGGER.debug(response)
+        except botocore.exceptions.ClientError as e:
+            LOGGER.error("Error updating Default Host Management Configuration: %s", e)
+
     def configure(self):
         LOGGER.info("Configuring '%s' region...", self.region_name)
         self.create_sct_vpc()
@@ -624,4 +652,5 @@ def configure(self):
         self.create_sct_security_group()
         self.create_sct_ssh_security_group()
         self.create_sct_key_pair()
+        self.configure_ssm()
         LOGGER.info("Region configured successfully.")
