|
28 | 28 | from dataclasses import dataclass |
29 | 29 |
|
30 | 30 | import requests |
31 | | -from invoke.exceptions import UnexpectedExit |
| 31 | +from invoke.exceptions import UnexpectedExit, Failure as InvokeFailure |
| 32 | +from botocore.exceptions import ClientError |
32 | 33 |
|
33 | 34 | import sdcm.monitorstack.ui as monitoring_ui |
34 | 35 | from sdcm.paths import SCYLLA_YAML_PATH, SCYLLA_PROPERTIES_PATH, SCYLLA_MANAGER_AGENT_YAML_PATH, \ |
|
65 | 66 | from sdcm.utils.gce_utils import gce_public_addresses, gce_private_addresses |
66 | 67 | from sdcm.localhost import LocalHost |
67 | 68 | from sdcm.cloud_api_client import ScyllaCloudAPIClient |
| 69 | +from sdcm.utils.aws_ssm_runner import SSMCommandRunner |
| 70 | +from sdcm.remote.libssh2_client.exceptions import Failure as Libssh2_Failure |
68 | 71 |
|
69 | 72 | LOGGER = logging.getLogger(__name__) |
70 | 73 |
|
@@ -97,6 +100,17 @@ def distro(self): |
97 | 100 | LOGGER.info("Detected Linux distribution: %s", _distro.name) |
98 | 101 | return _distro |
99 | 102 |
|
@cached_property
def is_aws(self) -> bool:
    """Tell whether this node is described by an AWS instance record.

    The node counts as AWS when ``self._instance`` is a ``dict`` holding a
    non-``None`` ``"InstanceId"`` entry. The result is computed on first
    access and then cached by ``cached_property``.
    """
    instance = self._instance
    if not isinstance(instance, dict):
        return False
    return instance.get("InstanceId") is not None
| 106 | + |
@cached_property
def aws_ssm_runner(self) -> Optional[SSMCommandRunner]:
    """Build the SSM command runner for this node, if it is an AWS node.

    Returns:
        An ``SSMCommandRunner`` bound to the instance id and region taken
        from ``self._instance``, or ``None`` when the node is not AWS or
        the availability zone is missing from the instance record.

    The value (including ``None``) is cached by ``cached_property`` after
    the first access.
    """
    if not self.is_aws:
        return None

    instance_id = self._instance.get("InstanceId")
    # "Placement" or "AvailabilityZone" may be absent or None; do not let a
    # chained .get()/slice raise from inside a property access.
    availability_zone = (self._instance.get("Placement") or {}).get("AvailabilityZone")
    if not availability_zone:
        LOGGER.warning("Cannot create SSM runner for instance %s: availability zone is unknown", instance_id)
        return None

    # The region name is the zone name without its trailing zone letter.
    # NOTE(review): this presumably does not hold for Local/Wavelength zone
    # names, which carry an extra location suffix - confirm if those are used.
    region = availability_zone[:-1]
    return SSMCommandRunner(region_name=region, instance_id=instance_id)
| 113 | + |
100 | 114 | @retrying(n=30, sleep_time=15, allowed_exceptions=(UnexpectedExit, Libssh2_UnexpectedExit,)) |
101 | 115 | def install_package(self, |
102 | 116 | package_name: str, |
@@ -217,18 +231,24 @@ class CommandLog(BaseLogEntity): |
def collect(self, node, local_dst, remote_dst=None, local_search_path=None) -> Optional[str]:
    """Run this entity's command for ``node`` and bring the output to ``local_dst``.

    Args:
        node: node to collect from; it must be truthy and have a remoter.
        local_dst: local directory that receives the log file.
        remote_dst: directory used to build the log file path; when it is
            ``None`` nothing is collected.
        local_search_path: not used by this implementation.

    Returns:
        Path of the collected file inside ``local_dst`` as a string, or
        ``None`` when there was nothing to collect.
    """
    if not (node and node.remoter) or remote_dst is None:
        return None

    produced_log, stored_on_node = LogCollector.collect_log_remotely(
        node=node,
        cmd=self.cmd,
        log_filename=os.path.join(remote_dst, self.name),
    )
    if not produced_log:
        LOGGER.warning(
            "Nothing to collect. Command '%s' did not prepare log file on remote host '%s'", self.cmd, node.name)
        return None

    destination = str(Path(local_dst) / Path(produced_log).name)
    if not stored_on_node:
        # The output was already written on this machine; just copy it over.
        shutil.copyfile(produced_log, destination)
        return destination

    # The output lives on the node and has to be downloaded.
    LogCollector.receive_log(node=node,
                             remote_log_path=produced_log,
                             local_dir=local_dst,
                             timeout=self.collect_timeout)
    return destination
232 | 252 |
|
233 | 253 |
|
234 | 254 | class FileLog(CommandLog): |
@@ -617,29 +637,76 @@ def create_local_storage_dir(self, base_local_dir): |
def create_remote_storage_dir(self, node, path=''):
    """Create the per-node storage directory and return the directory to use.

    The directory ``<self.node_remote_dir>/<path>`` is created with
    ``mkdir -p`` over the node's remoter when it reports being up, otherwise
    through the node's SSM runner when one is available.

    Args:
        node: node on which the directory is created.
        path: sub-directory name; defaults to ``node.name`` when empty.

    Returns:
        The created directory, or ``self.node_remote_dir`` whenever the
        directory could not be created (command failed, transport error,
        or no transport available), so callers never receive a path that
        was not created.
    """
    if not path:
        path = node.name
    remote_dir = os.path.join(self.node_remote_dir, path)
    mkdir_cmd = 'mkdir -p {}'.format(remote_dir)

    if node.remoter and node.remoter.is_up():
        try:
            result = node.remoter.run(mkdir_cmd, ignore_status=True)
        except (Libssh2_Failure, InvokeFailure) as details:
            LOGGER.error("Error during creating remote directory %s", details)
            # Fall back to the base directory, as the previous implementation did.
            return self.node_remote_dir
        if result.exited > 0:
            LOGGER.error('Remote storing folder not created.\n%s', result)
            return self.node_remote_dir
        return remote_dir

    ssm_runner = node.aws_ssm_runner
    if not ssm_runner:
        # Neither the remoter nor SSM is usable for this node.
        return self.node_remote_dir

    try:
        ssm_result = ssm_runner.run(mkdir_cmd, ignore_status=True)
    except (ClientError, AttributeError) as e:
        LOGGER.error("Failed to run SSM command: %s", e)
        return self.node_remote_dir
    if not ssm_result.ok:
        LOGGER.error("SSM command failed for instance %s: mkdir", node._instance.get("InstanceId"))
        # Treat a failed mkdir the same way as on the remoter path.
        return self.node_remote_dir
    return remote_dir
634 | 668 |
|
@staticmethod
def collect_log_remotely(node, cmd: str, log_filename: str) -> Tuple[Optional[str], bool]:
    """Run ``cmd`` for ``node`` and capture its output in ``log_filename``.

    When the node's remoter reports being up, the command runs through it
    and its output is redirected to ``log_filename`` on the node. If the
    remoter is down, or fails while running, and the node has an SSM
    runner, the command runs through SSM and the output is saved to
    ``log_filename`` on the local machine instead.

    Args:
        node: node to collect from.
        cmd: shell command producing the log.
        log_filename: path of the file receiving the command output.

    Returns:
        ``(path, is_file_remote)``. ``path`` is ``log_filename`` on success
        and ``None`` when no log was produced. ``is_file_remote`` is ``True``
        only when the remoter path was used, i.e. the file is on the node.
    """
    if not node.remoter:
        return None, False

    if node.remoter.is_up():
        try:
            node.remoter.run(f"{cmd} > '{log_filename}' 2>&1", ignore_status=True, verbose=True)
            result = node.remoter.run(f"test -f '{log_filename}'", ignore_status=True)
        except (Libssh2_Failure, InvokeFailure) as exc:
            # The connection broke mid-collection; try the SSM fallback below.
            LOGGER.warning("Collecting '%s' from node %s over remoter failed: %s", log_filename, node.name, exc)
        else:
            return (log_filename if result.ok else None), True

    ssm_runner = node.aws_ssm_runner
    if not ssm_runner:
        # No usable transport: report "nothing collected" instead of falling
        # through to a return that reads a result that was never assigned.
        return None, False

    LOGGER.info("Collecting Node %s via SSM: %s", node.name, log_filename)
    try:
        # SSM output is written on this machine, so the parent directory of
        # log_filename has to exist locally.
        Path(log_filename).parent.mkdir(parents=True, exist_ok=True)
        ssm_result = ssm_runner.run_command_and_save_output(
            command=cmd,
            local_output_file=log_filename,
            comment=f'Collect log {log_filename}',
            ignore_status=True
        )
    except (ClientError, OSError, ImportError, AttributeError, TypeError, ValueError, KeyError, IndexError) as e:
        LOGGER.error("Failed to run SSM command: %s", e)
        return None, False

    if not ssm_result.ok:
        LOGGER.error("SSM command failed for instance %s: %s ",
                     node._instance.get("InstanceId"), cmd)
        return None, False
    return log_filename, False
643 | 710 |
|
644 | 711 | @staticmethod |
645 | 712 | def archive_log_remotely(node, log_filename: str, archive_name: Optional[str] = None) -> Optional[str]: |
|
0 commit comments