From 2f34375069540735001d659ca05cd6262d5131aa Mon Sep 17 00:00:00 2001
From: Israel Fruchter
Date: Thu, 23 Oct 2025 13:18:13 +0300
Subject: [PATCH] feature(aws): fall back to SSM-based access during log collection

From time to time we lose SSH access to instances and end up with zero logs
explaining why (this has happened multiple times over the years: credentials
issues, or other cloud-init/boot problems).

With this change we make sure the SSM agent is running on our instances, and
fall back to it during log collection whenever SSH access is unavailable:

* added it to the region configuration to enable it
* added it to the top of the cloud-init script to unmask the agent,
  see: https://github.com/scylladb/scylla-machine-image/commit/b8e494d0aacb6d2b4853a132e838851e53a6407c
* added `SSMCommandRunner`, which has a `run()` API like our SSH-based remoters
* `CommandLog` collection falls back to `SSMCommandRunner` when SSH is unavailable

Ref: #11581

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 sdcm/logcollector.py                       | 115 ++++++--
 sdcm/provision/aws/configuration_script.py |   8 +-
 sdcm/provision/aws/utils.py                |  11 +
 sdcm/utils/aws_region.py                   |  30 ++
 sdcm/utils/aws_ssm_runner.py               | 304 +++++++++++++++++++++
 unit_tests/test_aws_ssm_runner.py          | 183 +++++++++++++
 6 files changed, 625 insertions(+), 26 deletions(-)
 create mode 100644 sdcm/utils/aws_ssm_runner.py
 create mode 100644 unit_tests/test_aws_ssm_runner.py

diff --git a/sdcm/logcollector.py b/sdcm/logcollector.py
index a4b7cc5bef5..5e7bf206ef4 100644
--- a/sdcm/logcollector.py
+++ b/sdcm/logcollector.py
@@ -28,7 +28,8 @@
 from dataclasses import dataclass
 
 import requests
-from invoke.exceptions import UnexpectedExit
+from invoke.exceptions import UnexpectedExit, Failure as InvokeFailure
+from botocore.exceptions import ClientError
 
 import sdcm.monitorstack.ui as monitoring_ui
 from sdcm.paths import SCYLLA_YAML_PATH, SCYLLA_PROPERTIES_PATH, SCYLLA_MANAGER_AGENT_YAML_PATH, \
@@ -65,6 +66,8 @@
 from sdcm.utils.gce_utils import gce_public_addresses, gce_private_addresses
 from sdcm.localhost import LocalHost
 from sdcm.cloud_api_client import ScyllaCloudAPIClient
+from sdcm.utils.aws_ssm_runner import SSMCommandRunner
+from sdcm.remote.libssh2_client.exceptions import Failure as Libssh2_Failure
 
 LOGGER = logging.getLogger(__name__)
 
@@ -97,6 +100,17 @@ def distro(self):
         LOGGER.info("Detected Linux distribution: %s", _distro.name)
         return _distro
 
+    @cached_property
+    def is_aws(self) -> bool:
+        return isinstance(self._instance, dict) and self._instance.get("InstanceId") is not None
+
+    @cached_property
+    def aws_ssm_runner(self) -> Optional[SSMCommandRunner]:
+        if self.is_aws:
+            region = self._instance.get("Placement").get("AvailabilityZone")[:-1]
+            return SSMCommandRunner(region_name=region, instance_id=self._instance.get("InstanceId"))
+        return None
+
     @retrying(n=30, sleep_time=15, allowed_exceptions=(UnexpectedExit, Libssh2_UnexpectedExit,))
     def install_package(self,
                         package_name: str,
@@ -217,18 +231,24 @@ class CommandLog(BaseLogEntity):
     def collect(self, node, local_dst, remote_dst=None, local_search_path=None) -> Optional[str]:
         if not node or not node.remoter or remote_dst is None:
             return None
-        remote_logfile = LogCollector.collect_log_remotely(node=node,
-                                                           cmd=self.cmd,
-                                                           log_filename=os.path.join(remote_dst, self.name))
+
+        remote_logfile, 
is_file_remote = LogCollector.collect_log_remotely(node=node, + cmd=self.cmd, + log_filename=os.path.join(remote_dst, self.name)) if not remote_logfile: LOGGER.warning( "Nothing to collect. Command '%s' did not prepare log file on remote host '%s'", self.cmd, node.name) return None - LogCollector.receive_log(node=node, - remote_log_path=remote_logfile, - local_dir=local_dst, - timeout=self.collect_timeout) - return os.path.join(local_dst, os.path.basename(remote_logfile)) + local_path = Path(local_dst) / Path(remote_logfile).name + if is_file_remote: + LogCollector.receive_log(node=node, + remote_log_path=remote_logfile, + local_dir=local_dst, + timeout=self.collect_timeout) + else: + # copy locally + shutil.copyfile(remote_logfile, str(local_path)) + return str(local_path) class FileLog(CommandLog): @@ -617,29 +637,76 @@ def create_local_storage_dir(self, base_local_dir): def create_remote_storage_dir(self, node, path=''): if not path: path = node.name - try: - remote_dir = os.path.join(self.node_remote_dir, path) - result = node.remoter.run('mkdir -p {}'.format(remote_dir), ignore_status=True) + remote_dir = os.path.join(self.node_remote_dir, path) - if result.exited > 0: - LOGGER.error( - 'Remote storing folder not created.\n{}'.format(result)) - remote_dir = self.node_remote_dir + if ssh_connected := node.remoter.is_up(): - except Exception as details: # noqa: BLE001 - LOGGER.error("Error during creating remote directory %s", details) + try: + result = node.remoter.run('mkdir -p {}'.format(remote_dir), ignore_status=True) + + if result.exited > 0: + LOGGER.error( + 'Remote storing folder not created.\n{}'.format(result)) + remote_dir = self.node_remote_dir + + except (Libssh2_Failure, InvokeFailure) as details: + LOGGER.error("Error during creating remote directory %s", details) + elif not ssh_connected and (ssm_runner := node.aws_ssm_runner): + try: + ssm_result = ssm_runner.run('mkdir -p {}'.format(remote_dir), ignore_status=True) + ok = ssm_result.ok + if not ok: + LOGGER.error("SSM command failed for instance %s: mkdir", node._instance.get("InstanceId")) + + except (ClientError, AttributeError) as e: + LOGGER.error("Failed to run SSM command: %s", e) + remote_dir = self.node_remote_dir + else: remote_dir = self.node_remote_dir return remote_dir @staticmethod - def collect_log_remotely(node, cmd: str, log_filename: str) -> Optional[str]: + def collect_log_remotely(node, cmd: str, log_filename: str) -> Tuple[Optional[str], bool]: if not node.remoter: - return None - collect_log_command = f"{cmd} > '{log_filename}' 2>&1" - node.remoter.run(collect_log_command, ignore_status=True, verbose=True) - result = node.remoter.run(f"test -f '{log_filename}'", ignore_status=True) - return log_filename if result.ok else None + return None, False + + is_file_remote = True + + if ssh_connected := node.remoter.is_up(): + try: + collect_log_command = f"{cmd} > '{log_filename}' 2>&1" + node.remoter.run(collect_log_command, ignore_status=True, verbose=True) + result = node.remoter.run(f"test -f '{log_filename}'", ignore_status=True) + ok = result.ok + except (Libssh2_Failure, InvokeFailure): + ssh_connected = False + + # Check if node is AWS-based + if not ssh_connected and (ssm_runner := node.aws_ssm_runner): + LOGGER.info("Collecting Node %s via SSM: %s", node.name, log_filename) + Path(log_filename).parent.mkdir(parents=True, exist_ok=True) + + # Use SSM to run the command and save it to a local file + is_file_remote = False + + try: + collect_log_command = f"{cmd}" + ssm_result = 
ssm_runner.run_command_and_save_output( + command=collect_log_command, + local_output_file=log_filename, + comment=f'Collect log {log_filename}', + ignore_status=True + ) + ok = ssm_result.ok + if not ssm_result.ok: + LOGGER.error("SSM command failed for instance %s: %s ", + node._instance.get("InstanceId"), collect_log_command) + return None, is_file_remote + except (ImportError, AttributeError, TypeError, ValueError, KeyError, IndexError) as e: + LOGGER.error("Failed to run SSM command: %s", e) + return None, is_file_remote + return log_filename if ok else None, is_file_remote @staticmethod def archive_log_remotely(node, log_filename: str, archive_name: Optional[str] = None) -> Optional[str]: diff --git a/sdcm/provision/aws/configuration_script.py b/sdcm/provision/aws/configuration_script.py index c03977eca09..136111fe2e3 100644 --- a/sdcm/provision/aws/configuration_script.py +++ b/sdcm/provision/aws/configuration_script.py @@ -11,7 +11,10 @@ # # Copyright (c) 2021 ScyllaDB -from sdcm.provision.aws.utils import network_config_ipv6_workaround_script +from sdcm.provision.aws.utils import ( + network_config_ipv6_workaround_script, + enable_ssm_agent_script, +) from sdcm.provision.common.configuration_script import ConfigurationScriptBuilder @@ -26,7 +29,8 @@ def _wait_before_running_script(self) -> str: return 'while ! systemctl status cloud-init.service | grep "active (exited)"; do sleep 1; done\n' def _script_body(self) -> str: - script = super()._script_body() + script = enable_ssm_agent_script() + script += super()._script_body() if self.aws_ipv6_workaround: script += network_config_ipv6_workaround_script() return script diff --git a/sdcm/provision/aws/utils.py b/sdcm/provision/aws/utils.py index 63cd4ef246e..8dc0835f8c8 100644 --- a/sdcm/provision/aws/utils.py +++ b/sdcm/provision/aws/utils.py @@ -292,6 +292,17 @@ def network_config_ipv6_workaround_script(): """) +def enable_ssm_agent_script(): + """Our images come with it masked by default. For testing we want this for debugging purposes, especially when we can't have SSH connectivity.""" + return dedent(r""" + if ! systemctl is-active --quiet amazon-ssm-agent; then + systemctl unmask amazon-ssm-agent + systemctl enable amazon-ssm-agent + systemctl start amazon-ssm-agent + fi + """) + + def configure_set_preserve_hostname_script(): return 'grep "preserve_hostname: true" /etc/cloud/cloud.cfg 1>/dev/null 2>&1 ' \ '|| echo "preserve_hostname: true" >> /etc/cloud/cloud.cfg\n' diff --git a/sdcm/utils/aws_region.py b/sdcm/utils/aws_region.py index fae16abed81..545ad111e0b 100644 --- a/sdcm/utils/aws_region.py +++ b/sdcm/utils/aws_region.py @@ -37,6 +37,7 @@ class AwsRegion: SCT_KEY_PAIR_NAME = "scylla_test_id_ed25519" # TODO: change legacy name to sct-keypair-aws SCT_SSH_GROUP_NAME = 'SCT-ssh-sg' SCT_SUBNET_PER_AZ = 2 # how many subnets to configure in each region. 
+ SCT_NODES_ROLE_ARN = "qa-scylla-manager-backup-role" def __init__(self, region_name): self.region_name = region_name @@ -651,6 +652,34 @@ def get_vpc_peering_routes(self) -> list[str]: LOGGER.debug("Discovered %s VPC peering routes in %s", peering_routes, self.region_name) return peering_routes + @cached_property + def ssm(self): + return boto3.client("ssm", region_name=self.region_name) + + @cached_property + def sts(self): + return boto3.client("sts", region_name=self.region_name) + + def configure_ssm(self, role_name=SCT_NODES_ROLE_ARN): + """Ensure that SSM agent can work in the region by adding necessary IAM role and instance profile""" + + # Replace with the actual ARN of your IAM role created in step 1 + # Example: service-role/AWSSystemsManagerDefaultEC2InstanceManagementRole + # Note: The 'service-role/' prefix is crucial when setting the value. + iam_role_for_dhmc = f"service-role/{role_name}" + + account_id = self.sts.get_caller_identity()["Account"] + region = self.ssm.meta.region_name + + setting_id = f"arn:aws:ssm:{region}:{account_id}:servicesetting/ssm/managed-instance/default-ec2-instance-management-role" + + try: + response = self.ssm.update_service_setting(SettingId=setting_id, SettingValue=iam_role_for_dhmc) + LOGGER.info("Default Host Management Configuration updated successfully.") + LOGGER.debug(response) + except botocore.exceptions.ClientError as e: + LOGGER.error(f"Error updating Default Host Management Configuration: {e}") + def configure(self): LOGGER.info("Configuring '%s' region...", self.region_name) self.create_sct_vpc() @@ -660,4 +689,5 @@ def configure(self): self.create_sct_security_group() self.create_sct_ssh_security_group() self.create_sct_key_pair() + self.configure_ssm() LOGGER.info("Region configured successfully.") diff --git a/sdcm/utils/aws_ssm_runner.py b/sdcm/utils/aws_ssm_runner.py new file mode 100644 index 00000000000..c1472a6220a --- /dev/null +++ b/sdcm/utils/aws_ssm_runner.py @@ -0,0 +1,304 @@ +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# See LICENSE for more details. +# +# Copyright (c) 2025 ScyllaDB + +""" +AWS Systems Manager (SSM) command runner utility. + +This module provides functionality to run commands on EC2 instances using +AWS Systems Manager Run Command, replacing the AWS CLI with boto3 API calls. +""" + +import logging +import time +from typing import Optional, Tuple, List +import boto3 +from botocore.exceptions import ClientError +from invoke.runners import Result +from invoke.watchers import StreamWatcher + +LOGGER = logging.getLogger(__name__) + + +class SSMCommandRunner: + """Run commands on EC2 instances using AWS Systems Manager.""" + + def __init__(self, region_name: str, instance_id: str): + """ + Initialize SSM command runner. 
+ + Args: + region_name: AWS region name where the instance is located + instance_id: EC2 instance ID + """ + self.region_name = region_name + self.instance_id = instance_id + self.ssm_client = boto3.client('ssm', region_name=region_name) + self.ec2_client = boto3.client('ec2', region_name=region_name) + + def check_ssm_prerequisites(self) -> Tuple[bool, str]: + """ + Check if the instance is ready for SSM commands. + + Returns: + Tuple of (success: bool, message: str) + """ + try: + # Check EC2 instance state + instance_data = self.ec2_client.describe_instances(InstanceIds=[self.instance_id]) + instance_status = instance_data['Reservations'][0]['Instances'][0]['State']['Name'] + + if instance_status != 'running': + msg = f"Instance is in state '{instance_status}', must be 'running' for SSM commands" + LOGGER.warning(msg) + return False, msg + + LOGGER.debug("Instance %s state check: SUCCESS (Status is '%s')", self.instance_id, instance_status) + + # Check SSM Agent connectivity + ssm_info = self.ssm_client.describe_instance_information( + InstanceInformationFilterList=[ + { + 'key': 'InstanceIds', + 'valueSet': [self.instance_id] + } + ] + ) + + info_list = ssm_info.get('InstanceInformationList', []) + + if not info_list: + msg = ("SSM Agent is NOT reporting in. Check IAM Instance Profile includes " + "AmazonSSMManagedInstanceCore policy") + LOGGER.warning(msg) + return False, msg + + # Check Agent Ping Status + ping_status = info_list[0].get('PingStatus') + + if ping_status != 'Online': + msg = f"SSM Agent status is '{ping_status}', must be 'Online'" + LOGGER.warning(msg) + return False, msg + + LOGGER.debug("SSM Agent status for %s: SUCCESS (Status is '%s')", self.instance_id, ping_status) + return True, "SUCCESS" + + except ClientError as e: + error_code = e.response['Error']['Code'] + msg = f"AWS client error during SSM prerequisites check: {error_code} - {e}" + LOGGER.error(msg) + return False, msg + except (KeyError, IndexError) as e: + msg = f"Unexpected response format during SSM prerequisites check: {e}" + LOGGER.error(msg) + return False, msg + + def run( + self, + cmd: str, + timeout: Optional[float] = 300, + ignore_status: bool = False, + verbose: bool = True, + log_file: Optional[str] = None, + retry: int = 1, + watchers: Optional[List[StreamWatcher]] = None, + comment: Optional[str] = None + ) -> Result: + """ + Run a shell command on an EC2 instance using SSM. + + Args: + cmd: Shell command to execute + timeout: Command execution timeout in seconds (default: 300) + ignore_status: If True, do not raise exception on command failure + verbose: If True, log command output + log_file: Optional file path to save command output + retry: Number of retry attempts (currently not implemented for SSM) + watchers: Stream watchers (not used for SSM) + comment: Optional comment for the SSM command + + Returns: + Result object with stdout, stderr, exited (status code), ok, etc. 
+ """ + # Check prerequisites first + prerequisites_ok, prerequisites_msg = self.check_ssm_prerequisites() + if not prerequisites_ok: + LOGGER.error("SSM prerequisites check failed for %s: %s", self.instance_id, prerequisites_msg) + return self._create_result( + command=cmd, + stdout='', + stderr=prerequisites_msg, + exited=255, + ignore_status=ignore_status + ) + + try: + if verbose: + LOGGER.debug("Sending SSM command to instance %s: %s", self.instance_id, cmd) + + # Send the command + send_params = { + 'InstanceIds': [self.instance_id], + 'DocumentName': 'AWS-RunShellScript', + 'Parameters': {'commands': [cmd]}, + 'TimeoutSeconds': int(timeout) if timeout else 300, + } + if comment: + send_params['Comment'] = comment + + response = self.ssm_client.send_command(**send_params) + + command_id = response['Command']['CommandId'] + if verbose: + LOGGER.debug("SSM Command ID: %s. Waiting for command execution...", command_id) + + # Poll for command completion + status = 'Pending' + invocation_response = None + max_wait_attempts = int((timeout or 300) / 5) if timeout else 60 + + for attempt in range(max_wait_attempts): + time.sleep(5) + + invocation_response = self.ssm_client.get_command_invocation( + CommandId=command_id, + InstanceId=self.instance_id + ) + + status = invocation_response['Status'] + if verbose: + LOGGER.debug("SSM command %s status (attempt %d/%d): %s", + command_id, attempt + 1, max_wait_attempts, status) + + if status in ['Success', 'Failed', 'Cancelled', 'TimedOut']: + break + + # Retrieve the output + stdout = invocation_response.get('StandardOutputContent', '') + stderr = invocation_response.get('StandardErrorContent', '') + + # Map SSM status to exit code + exit_code = 0 if status == 'Success' else 1 + if status == 'TimedOut': + exit_code = 124 + elif status == 'Cancelled': + exit_code = 130 + + if status != 'Success' and verbose: + LOGGER.error("SSM command failed (Status: %s). Error: %s", status, stderr) + + # Save to log file if requested + if log_file: + try: + with open(log_file, 'w', encoding='utf-8') as f: + if stdout: + f.write(stdout) + if stderr: + f.write('\n--- STDERR ---\n') + f.write(stderr) + except IOError as e: + LOGGER.error("Failed to save SSM output to file %s: %s", log_file, e) + + return self._create_result( + command=cmd, + stdout=stdout, + stderr=stderr, + exited=exit_code, + ignore_status=ignore_status + ) + + except ClientError as e: + error_code = e.response['Error']['Code'] + error_msg = f"AWS error during SSM command execution: {error_code} - {e}" + LOGGER.error(error_msg) + return self._create_result( + command=cmd, + stdout='', + stderr=error_msg, + exited=255, + ignore_status=ignore_status + ) + except (KeyError, IndexError) as e: + error_msg = f"Unexpected error during SSM command execution: {e}" + LOGGER.error(error_msg) + return self._create_result( + command=cmd, + stdout='', + stderr=str(e), + exited=255, + ignore_status=ignore_status + ) + + @staticmethod + def _create_result(command: str, stdout: str, stderr: str, exited: int, ignore_status: bool = False) -> Result: + """ + Create a Result object compatible with invoke.runners.Result. 
+ + Args: + command: The command that was executed + stdout: Standard output content + stderr: Standard error content + exited: Exit code + ignore_status: If True, do not raise exception on failure + + Returns: + Result object + """ + # Create a Result object by manually setting its attributes + # Result objects are normally created by invoke's Runner, but we need to create one manually + result = object.__new__(Result) + result.command = command + result.stdout = stdout + result.stderr = stderr + result.exited = exited + result.encoding = 'utf-8' + result.hide = False + result.pty = False + result.env = {} + + # Note: 'ok', 'return_code', and 'exit_status' are read-only properties + # that are computed from 'exited', so we don't need to set them + + return result + + def run_command_and_save_output( + self, + command: str, + local_output_file: str, + comment: Optional[str] = None, + timeout: Optional[float] = 300, + ignore_status: bool = False + ) -> Result: + """ + Run a command on an EC2 instance and save its output to a local file. + + This is a convenience wrapper around run() that automatically saves to log_file. + + Args: + command: Shell command to execute + local_output_file: Local file path to save the command output + comment: Optional comment for the SSM command + timeout: Command execution timeout in seconds + ignore_status: If True, do not raise exception on command failure + + Returns: + Result object from the command execution + """ + return self.run( + cmd=command, + timeout=timeout, + ignore_status=ignore_status, + verbose=True, + log_file=local_output_file, + comment=comment or f'Run command and save to {local_output_file}' + ) diff --git a/unit_tests/test_aws_ssm_runner.py b/unit_tests/test_aws_ssm_runner.py new file mode 100644 index 00000000000..690ac32c901 --- /dev/null +++ b/unit_tests/test_aws_ssm_runner.py @@ -0,0 +1,183 @@ +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# See LICENSE for more details. +# +# Copyright (c) 2025 ScyllaDB + +""" +Unit tests for AWS SSM Runner module. 
+""" + +import pytest +from unittest.mock import patch, MagicMock, mock_open +from sdcm.utils.aws_ssm_runner import SSMCommandRunner + + +@pytest.fixture +def region(): + """AWS region fixture.""" + return 'us-east-1' + + +@pytest.fixture +def instance_id(): + """EC2 instance ID fixture.""" + return 'i-1234567890abcdef0' + + +@pytest.fixture +def mock_boto_clients(): + """Mock boto3 clients.""" + mock_ssm = MagicMock() + mock_ec2 = MagicMock() + return mock_ssm, mock_ec2 + + +@patch('sdcm.utils.aws_ssm_runner.boto3.client') +def test_init(mock_boto_client, region, instance_id): + """Test SSMCommandRunner initialization.""" + runner = SSMCommandRunner(region, instance_id) + + assert runner.region_name == region + assert runner.instance_id == instance_id + assert mock_boto_client.call_count == 2 + mock_boto_client.assert_any_call('ssm', region_name=region) + mock_boto_client.assert_any_call('ec2', region_name=region) + + +@patch('sdcm.utils.aws_ssm_runner.boto3.client') +def test_check_ssm_prerequisites_success(mock_boto_client, region, instance_id, mock_boto_clients): + """Test successful SSM prerequisites check.""" + mock_ssm, mock_ec2 = mock_boto_clients + mock_boto_client.side_effect = [mock_ssm, mock_ec2] + + # Mock EC2 response + mock_ec2.describe_instances.return_value = { + 'Reservations': [{ + 'Instances': [{ + 'State': {'Name': 'running'} + }] + }] + } + + # Mock SSM response + mock_ssm.describe_instance_information.return_value = { + 'InstanceInformationList': [{ + 'PingStatus': 'Online' + }] + } + + runner = SSMCommandRunner(region, instance_id) + success, message = runner.check_ssm_prerequisites() + + assert success is True + assert message == 'SUCCESS' + + +@patch('sdcm.utils.aws_ssm_runner.boto3.client') +def test_check_ssm_prerequisites_not_running(mock_boto_client, region, instance_id, mock_boto_clients): + """Test SSM prerequisites check with non-running instance.""" + mock_ssm, mock_ec2 = mock_boto_clients + mock_boto_client.side_effect = [mock_ssm, mock_ec2] + + # Mock EC2 response with stopped instance + mock_ec2.describe_instances.return_value = { + 'Reservations': [{ + 'Instances': [{ + 'State': {'Name': 'stopped'} + }] + }] + } + + runner = SSMCommandRunner(region, instance_id) + success, message = runner.check_ssm_prerequisites() + + assert success is False + assert 'stopped' in message + + +@patch('sdcm.utils.aws_ssm_runner.boto3.client') +@patch('sdcm.utils.aws_ssm_runner.time.sleep') +def test_run_success(mock_sleep, mock_boto_client, region, instance_id, mock_boto_clients): + """Test successful command execution.""" + mock_ssm, mock_ec2 = mock_boto_clients + mock_boto_client.side_effect = [mock_ssm, mock_ec2] + + # Mock prerequisites check + mock_ec2.describe_instances.return_value = { + 'Reservations': [{ + 'Instances': [{ + 'State': {'Name': 'running'} + }] + }] + } + mock_ssm.describe_instance_information.return_value = { + 'InstanceInformationList': [{ + 'PingStatus': 'Online' + }] + } + + # Mock command execution + mock_ssm.send_command.return_value = { + 'Command': {'CommandId': 'cmd-123'} + } + mock_ssm.get_command_invocation.return_value = { + 'Status': 'Success', + 'StandardOutputContent': 'Hello World', + 'StandardErrorContent': '' + } + + runner = SSMCommandRunner(region, instance_id) + result = runner.run(cmd='echo "Hello World"') + + assert result.ok is True + assert result.stdout == 'Hello World' + assert result.exited == 0 + + +@patch('sdcm.utils.aws_ssm_runner.boto3.client') +@patch('builtins.open', new_callable=mock_open) 
+@patch('sdcm.utils.aws_ssm_runner.time.sleep') +def test_run_with_log_file(mock_sleep, mock_file_open, mock_boto_client, region, instance_id, mock_boto_clients): + """Test command execution with output saved to file.""" + mock_ssm, mock_ec2 = mock_boto_clients + mock_boto_client.side_effect = [mock_ssm, mock_ec2] + + # Mock prerequisites and command execution + mock_ec2.describe_instances.return_value = { + 'Reservations': [{ + 'Instances': [{ + 'State': {'Name': 'running'} + }] + }] + } + mock_ssm.describe_instance_information.return_value = { + 'InstanceInformationList': [{ + 'PingStatus': 'Online' + }] + } + mock_ssm.send_command.return_value = { + 'Command': {'CommandId': 'cmd-123'} + } + mock_ssm.get_command_invocation.return_value = { + 'Status': 'Success', + 'StandardOutputContent': 'Test output', + 'StandardErrorContent': '' + } + + runner = SSMCommandRunner(region, instance_id) + result = runner.run( + cmd='echo "Test"', + log_file='/tmp/output.txt' + ) + + assert result.ok is True + assert result.stdout == 'Test output' + mock_file_open.assert_called_once_with('/tmp/output.txt', 'w', encoding='utf-8')
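
A minimal usage sketch of the new runner, for illustration only; the region,
instance ID, commands, and output path below are made-up values and are not
part of this patch:

    from sdcm.utils.aws_ssm_runner import SSMCommandRunner

    # hypothetical instance details, assumed for this example
    runner = SSMCommandRunner(region_name="eu-west-1", instance_id="i-0123456789abcdef0")

    # remoter-style run(): returns an invoke Result with .ok, .exited, .stdout, .stderr
    result = runner.run("cloud-init status --long", ignore_status=True)
    if result.ok:
        print(result.stdout)

    # the log-collection fallback path: run a command and save its output to a local file
    runner.run_command_and_save_output(
        command="cat /var/log/cloud-init-output.log",
        local_output_file="/tmp/cloud-init-output.log",
        comment="Collect cloud-init output via SSM",
    )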