115 changes: 91 additions & 24 deletions sdcm/logcollector.py
@@ -28,7 +28,8 @@
 from dataclasses import dataclass

 import requests
-from invoke.exceptions import UnexpectedExit
+from invoke.exceptions import UnexpectedExit, Failure as InvokeFailure
+from botocore.exceptions import ClientError

 import sdcm.monitorstack.ui as monitoring_ui
 from sdcm.paths import SCYLLA_YAML_PATH, SCYLLA_PROPERTIES_PATH, SCYLLA_MANAGER_AGENT_YAML_PATH, \
@@ -65,6 +66,8 @@
 from sdcm.utils.gce_utils import gce_public_addresses, gce_private_addresses
 from sdcm.localhost import LocalHost
 from sdcm.cloud_api_client import ScyllaCloudAPIClient
+from sdcm.utils.aws_ssm_runner import SSMCommandRunner
+from sdcm.remote.libssh2_client.exceptions import Failure as Libssh2_Failure

 LOGGER = logging.getLogger(__name__)

@@ -97,6 +100,17 @@ def distro(self):
         LOGGER.info("Detected Linux distribution: %s", _distro.name)
         return _distro

+    @cached_property
+    def is_aws(self) -> bool:
+        return isinstance(self._instance, dict) and self._instance.get("InstanceId") is not None
+
+    @cached_property
+    def aws_ssm_runner(self) -> Optional[SSMCommandRunner]:
+        if self.is_aws:
+            region = self._instance.get("Placement").get("AvailabilityZone")[:-1]
+            return SSMCommandRunner(region_name=region, instance_id=self._instance.get("InstanceId"))
+        return None
+
     @retrying(n=30, sleep_time=15, allowed_exceptions=(UnexpectedExit, Libssh2_UnexpectedExit,))
     def install_package(self,
                         package_name: str,
@@ -217,18 +231,24 @@ class CommandLog(BaseLogEntity):
     def collect(self, node, local_dst, remote_dst=None, local_search_path=None) -> Optional[str]:
         if not node or not node.remoter or remote_dst is None:
             return None
-        remote_logfile = LogCollector.collect_log_remotely(node=node,
-                                                           cmd=self.cmd,
-                                                           log_filename=os.path.join(remote_dst, self.name))
+        remote_logfile, is_file_remote = LogCollector.collect_log_remotely(node=node,
+                                                                           cmd=self.cmd,
+                                                                           log_filename=os.path.join(remote_dst, self.name))
         if not remote_logfile:
             LOGGER.warning(
                 "Nothing to collect. Command '%s' did not prepare log file on remote host '%s'", self.cmd, node.name)
             return None
-        LogCollector.receive_log(node=node,
-                                 remote_log_path=remote_logfile,
-                                 local_dir=local_dst,
-                                 timeout=self.collect_timeout)
-        return os.path.join(local_dst, os.path.basename(remote_logfile))
+        local_path = Path(local_dst) / Path(remote_logfile).name
+        if is_file_remote:
+            LogCollector.receive_log(node=node,
+                                     remote_log_path=remote_logfile,
+                                     local_dir=local_dst,
+                                     timeout=self.collect_timeout)
+        else:
+            # copy locally
+            shutil.copyfile(remote_logfile, str(local_path))
+        return str(local_path)


 class FileLog(CommandLog):
@@ -617,29 +637,76 @@ def create_local_storage_dir(self, base_local_dir):
     def create_remote_storage_dir(self, node, path=''):
         if not path:
             path = node.name
-        try:
-            remote_dir = os.path.join(self.node_remote_dir, path)
-            result = node.remoter.run('mkdir -p {}'.format(remote_dir), ignore_status=True)
-
-            if result.exited > 0:
-                LOGGER.error(
-                    'Remote storing folder not created.\n{}'.format(result))
-                remote_dir = self.node_remote_dir
-
-        except Exception as details:  # noqa: BLE001
-            LOGGER.error("Error during creating remote directory %s", details)
+        remote_dir = os.path.join(self.node_remote_dir, path)
+
+        if ssh_connected := node.remoter.is_up():
+            try:
+                result = node.remoter.run('mkdir -p {}'.format(remote_dir), ignore_status=True)
+
+                if result.exited > 0:
+                    LOGGER.error(
+                        'Remote storing folder not created.\n{}'.format(result))
+                    remote_dir = self.node_remote_dir
+
+            except (Libssh2_Failure, InvokeFailure) as details:
+                LOGGER.error("Error during creating remote directory %s", details)
+        elif not ssh_connected and (ssm_runner := node.aws_ssm_runner):
+            try:
+                ssm_result = ssm_runner.run('mkdir -p {}'.format(remote_dir), ignore_status=True)
+                ok = ssm_result.ok
+                if not ok:
+                    LOGGER.error("SSM command failed for instance %s: mkdir", node._instance.get("InstanceId"))
+
+            except (ClientError, AttributeError) as e:
+                LOGGER.error("Failed to run SSM command: %s", e)
+                remote_dir = self.node_remote_dir
+        else:
+            remote_dir = self.node_remote_dir

         return remote_dir

     @staticmethod
-    def collect_log_remotely(node, cmd: str, log_filename: str) -> Optional[str]:
+    def collect_log_remotely(node, cmd: str, log_filename: str) -> Tuple[Optional[str], bool]:
         if not node.remoter:
-            return None
-        collect_log_command = f"{cmd} > '{log_filename}' 2>&1"
-        node.remoter.run(collect_log_command, ignore_status=True, verbose=True)
-        result = node.remoter.run(f"test -f '{log_filename}'", ignore_status=True)
-        return log_filename if result.ok else None
+            return None, False
+
+        is_file_remote = True
+
+        if ssh_connected := node.remoter.is_up():
+            try:
+                collect_log_command = f"{cmd} > '{log_filename}' 2>&1"
+                node.remoter.run(collect_log_command, ignore_status=True, verbose=True)
+                result = node.remoter.run(f"test -f '{log_filename}'", ignore_status=True)
+                ok = result.ok
+            except (Libssh2_Failure, InvokeFailure):
+                ssh_connected = False
+
+        # Check if node is AWS-based
+        if not ssh_connected and (ssm_runner := node.aws_ssm_runner):
+            LOGGER.info("Collecting Node %s via SSM: %s", node.name, log_filename)
+            Path(log_filename).parent.mkdir(parents=True, exist_ok=True)
+
+            # Use SSM to run the command and save its output to a local file
+            is_file_remote = False
+
+            try:
+                collect_log_command = f"{cmd}"
+                ssm_result = ssm_runner.run_command_and_save_output(
+                    command=collect_log_command,
+                    local_output_file=log_filename,
+                    comment=f'Collect log {log_filename}',
+                    ignore_status=True,
+                )
+                ok = ssm_result.ok
+                if not ssm_result.ok:
+                    LOGGER.error("SSM command failed for instance %s: %s",
+                                 node._instance.get("InstanceId"), collect_log_command)
+                    return None, is_file_remote
+            except (ImportError, AttributeError, TypeError, ValueError, KeyError, IndexError) as e:
+                LOGGER.error("Failed to run SSM command: %s", e)
+                return None, is_file_remote
+        return log_filename if ok else None, is_file_remote
Copilot AI commented on Oct 28, 2025:
The variable ok is only defined within the try-except blocks (lines 681 and 701), but it is referenced outside those blocks on line 709. If an exception occurs on lines 682-683 before ok is assigned, or if neither the SSH nor the SSM path executes, this will raise an UnboundLocalError. Initialize ok = False at the start of the method to ensure it always has a value.

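A minimal sketch of the suggested fix (assuming no other changes to the method): give ok a safe default before either branch can be skipped.

    @staticmethod
    def collect_log_remotely(node, cmd: str, log_filename: str) -> Tuple[Optional[str], bool]:
        if not node.remoter:
            return None, False

        is_file_remote = True
        ok = False  # safe default: if neither the SSH nor the SSM path runs, report failure

        # ... SSH attempt, then SSM fallback, exactly as in the diff above ...
        return log_filename if ok else None, is_file_remote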

     @staticmethod
     def archive_log_remotely(node, log_filename: str, archive_name: Optional[str] = None) -> Optional[str]:
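The SSMCommandRunner imported from sdcm.utils.aws_ssm_runner is not part of this diff. For reviewers, here is a minimal sketch of how such a runner can be built on boto3, mirroring the calls the collector makes (run, run_command_and_save_output, and a result object with .ok); the class body below is an assumption, not the actual implementation:

import time
from dataclasses import dataclass

import boto3


@dataclass
class SSMResult:
    ok: bool
    stdout: str
    stderr: str


class SSMCommandRunner:
    """Run shell commands on an EC2 instance through AWS Systems Manager (no SSH required)."""

    def __init__(self, region_name: str, instance_id: str):
        # region_name is derived above by trimming the AZ letter: "us-east-1a" -> "us-east-1"
        self.ssm = boto3.client("ssm", region_name=region_name)
        self.instance_id = instance_id

    def run(self, cmd: str, ignore_status: bool = False, comment: str = "", timeout: int = 300) -> SSMResult:
        response = self.ssm.send_command(
            InstanceIds=[self.instance_id],
            DocumentName="AWS-RunShellScript",  # stock SSM document for arbitrary shell commands
            Parameters={"commands": [cmd]},
            Comment=comment,
        )
        command_id = response["Command"]["CommandId"]
        time.sleep(2)  # give SSM a moment to register the invocation
        deadline = time.time() + timeout
        while time.time() < deadline:  # poll until the invocation leaves the in-progress states
            invocation = self.ssm.get_command_invocation(CommandId=command_id, InstanceId=self.instance_id)
            if invocation["Status"] not in ("Pending", "InProgress", "Delayed"):
                break
            time.sleep(5)
        result = SSMResult(ok=invocation["Status"] == "Success",
                           stdout=invocation.get("StandardOutputContent", ""),
                           stderr=invocation.get("StandardErrorContent", ""))
        if not result.ok and not ignore_status:
            raise RuntimeError(f"SSM command failed: {result.stderr}")
        return result

    def run_command_and_save_output(self, command: str, local_output_file: str,
                                    comment: str = "", ignore_status: bool = False) -> SSMResult:
        # Run remotely, then write the captured stdout to a local file -- which is
        # why collect_log_remotely() reports is_file_remote=False on this path.
        result = self.run(command, ignore_status=ignore_status, comment=comment)
        with open(local_output_file, "w", encoding="utf-8") as out:
            out.write(result.stdout)
        return result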
8 changes: 6 additions & 2 deletions sdcm/provision/aws/configuration_script.py
@@ -11,7 +11,10 @@
 #
 # Copyright (c) 2021 ScyllaDB

-from sdcm.provision.aws.utils import network_config_ipv6_workaround_script
+from sdcm.provision.aws.utils import (
+    network_config_ipv6_workaround_script,
+    enable_ssm_agent_script,
+)
 from sdcm.provision.common.configuration_script import ConfigurationScriptBuilder


@@ -26,7 +29,8 @@ def _wait_before_running_script(self) -> str:
         return 'while ! systemctl status cloud-init.service | grep "active (exited)"; do sleep 1; done\n'

     def _script_body(self) -> str:
-        script = super()._script_body()
+        script = enable_ssm_agent_script()
+        script += super()._script_body()
         if self.aws_ipv6_workaround:
             script += network_config_ipv6_workaround_script()
         return script
11 changes: 11 additions & 0 deletions sdcm/provision/aws/utils.py
@@ -292,6 +292,17 @@ def network_config_ipv6_workaround_script():
     """)


+def enable_ssm_agent_script():
+    """Enable the SSM agent: our images ship with it masked by default, but for tests we want it running for debugging, especially when SSH connectivity is unavailable."""
+    return dedent(r"""
+        if ! systemctl is-active --quiet amazon-ssm-agent; then
+            systemctl unmask amazon-ssm-agent
+            systemctl enable amazon-ssm-agent
+            systemctl start amazon-ssm-agent
+        fi
+    """)
+
+
 def configure_set_preserve_hostname_script():
     return 'grep "preserve_hostname: true" /etc/cloud/cloud.cfg 1>/dev/null 2>&1 ' \
            '|| echo "preserve_hostname: true" >> /etc/cloud/cloud.cfg\n'
30 changes: 30 additions & 0 deletions sdcm/utils/aws_region.py
@@ -37,6 +37,7 @@ class AwsRegion:
     SCT_KEY_PAIR_NAME = "scylla_test_id_ed25519"  # TODO: change legacy name to sct-keypair-aws
     SCT_SSH_GROUP_NAME = 'SCT-ssh-sg'
     SCT_SUBNET_PER_AZ = 2  # how many subnets to configure in each region.
+    SCT_NODES_ROLE_ARN = "qa-scylla-manager-backup-role"

     def __init__(self, region_name):
         self.region_name = region_name
@@ -651,6 +652,34 @@ def get_vpc_peering_routes(self) -> list[str]:
         LOGGER.debug("Discovered %s VPC peering routes in %s", peering_routes, self.region_name)
         return peering_routes

+    @cached_property
+    def ssm(self):
+        return boto3.client("ssm", region_name=self.region_name)
+
+    @cached_property
+    def sts(self):
+        return boto3.client("sts", region_name=self.region_name)
+
+    def configure_ssm(self, role_name=SCT_NODES_ROLE_ARN):
+        """Ensure the SSM agent can work in the region by setting the Default Host Management Configuration role."""
+
+        # The 'service-role/' prefix is required when setting the value,
+        # e.g. service-role/AWSSystemsManagerDefaultEC2InstanceManagementRole.
+        iam_role_for_dhmc = f"service-role/{role_name}"
+
+        account_id = self.sts.get_caller_identity()["Account"]
+        region = self.ssm.meta.region_name
+
+        setting_id = f"arn:aws:ssm:{region}:{account_id}:servicesetting/ssm/managed-instance/default-ec2-instance-management-role"
+
+        try:
+            response = self.ssm.update_service_setting(SettingId=setting_id, SettingValue=iam_role_for_dhmc)
+            LOGGER.info("Default Host Management Configuration updated successfully.")
+            LOGGER.debug(response)
+        except botocore.exceptions.ClientError as e:
+            LOGGER.error("Error updating Default Host Management Configuration: %s", e)
+
     def configure(self):
         LOGGER.info("Configuring '%s' region...", self.region_name)
         self.create_sct_vpc()
@@ -660,4 +689,5 @@ def configure(self):
         self.create_sct_security_group()
         self.create_sct_ssh_security_group()
         self.create_sct_key_pair()
+        self.configure_ssm()
         LOGGER.info("Region configured successfully.")