|
27 | 27 | from functools import cached_property |
28 | 28 |
|
29 | 29 | import requests |
30 | | -from invoke.exceptions import UnexpectedExit |
| 30 | +from invoke.exceptions import UnexpectedExit, Failure as InvokeFailure |
| 31 | +from botocore.exceptions import ClientError |
31 | 32 |
|
32 | 33 | import sdcm.monitorstack.ui as monitoring_ui |
33 | 34 | from sdcm.paths import SCYLLA_YAML_PATH, SCYLLA_PROPERTIES_PATH, SCYLLA_MANAGER_AGENT_YAML_PATH, \ |
|
62 | 63 | from sdcm.utils.k8s import KubernetesOps |
63 | 64 | from sdcm.utils.s3_remote_uploader import upload_remote_files_directly_to_s3 |
64 | 65 | from sdcm.utils.gce_utils import gce_public_addresses, gce_private_addresses |
| 66 | +from sdcm.utils.aws_ssm_runner import SSMCommandRunner |
| 67 | +from sdcm.remote.libssh2_client.exceptions import Failure as Libssh2_Failure |
65 | 76 |
|
66 | 77 | LOGGER = logging.getLogger(__name__) |
67 | 78 |
|
@@ -94,6 +105,17 @@ def distro(self): |
94 | 105 | LOGGER.info("Detected Linux distribution: %s", _distro.name) |
95 | 106 | return _distro |
96 | 107 |
|
| 108 | + @cached_property |
| 109 | + def is_aws(self) -> bool: |
| 110 | + return isinstance(self._instance, dict) and self._instance.get("InstanceId") is not None |
| 111 | + |
| 112 | + @cached_property |
| 113 | + def aws_ssm_runner(self) -> Optional[SSMCommandRunner]: |
| 114 | + if self.is_aws: |
| 115 | +            region = self._instance.get("Placement").get("AvailabilityZone")[:-1]  # strip the AZ letter, e.g. "eu-west-1a" -> "eu-west-1" |
| 116 | + return SSMCommandRunner(region_name=region, instance_id=self._instance.get("InstanceId")) |
| 117 | + return None |
| 118 | + |
97 | 119 | @retrying(n=30, sleep_time=15, allowed_exceptions=(UnexpectedExit, Libssh2_UnexpectedExit,)) |
98 | 120 | def install_package(self, |
99 | 121 | package_name: str, |
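
The `SSMCommandRunner` imported from `sdcm.utils.aws_ssm_runner` is not part of this diff. For reviewers unfamiliar with it, the sketch below shows the kind of boto3-based SSM execution such a runner is assumed to wrap; the class body, the `SSMResult` type and its `.ok` attribute are assumptions inferred from how the runner is constructed and called in this patch (region from the AZ name, `run(..., ignore_status=True)`), not the actual implementation.

```python
import time
from dataclasses import dataclass

import boto3


@dataclass
class SSMResult:
    # Minimal result type mirroring the `.ok` attribute the callers in this patch rely on.
    ok: bool
    stdout: str = ""
    stderr: str = ""


class SSMCommandRunnerSketch:
    """Hypothetical stand-in for sdcm.utils.aws_ssm_runner.SSMCommandRunner."""

    def __init__(self, region_name: str, instance_id: str) -> None:
        self._instance_id = instance_id
        self._ssm = boto3.client("ssm", region_name=region_name)

    def run(self, cmd: str, ignore_status: bool = False) -> SSMResult:
        # Dispatch the shell command through the stock AWS-RunShellScript document.
        command_id = self._ssm.send_command(
            InstanceIds=[self._instance_id],
            DocumentName="AWS-RunShellScript",
            Parameters={"commands": [cmd]},
        )["Command"]["CommandId"]
        # Poll until the invocation reaches a terminal state.
        while True:
            time.sleep(2)
            invocation = self._ssm.get_command_invocation(
                CommandId=command_id, InstanceId=self._instance_id)
            if invocation["Status"] not in ("Pending", "InProgress", "Delayed"):
                break
        ok = invocation["Status"] == "Success"
        if not ok and not ignore_status:
            raise RuntimeError(f"SSM command {command_id} finished with status {invocation['Status']}")
        return SSMResult(ok=ok,
                         stdout=invocation.get("StandardOutputContent", ""),
                         stderr=invocation.get("StandardErrorContent", ""))
```

The real runner may also deal with output size limits and S3 offloading of large command output, which this sketch omits.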
@@ -214,18 +236,24 @@ class CommandLog(BaseLogEntity): |
214 | 236 | def collect(self, node, local_dst, remote_dst=None, local_search_path=None) -> Optional[str]: |
215 | 237 | if not node or not node.remoter or remote_dst is None: |
216 | 238 | return None |
217 | | - remote_logfile = LogCollector.collect_log_remotely(node=node, |
218 | | - cmd=self.cmd, |
219 | | - log_filename=os.path.join(remote_dst, self.name)) |
| 239 | + |
| 240 | + remote_logfile, is_file_remote = LogCollector.collect_log_remotely(node=node, |
| 241 | + cmd=self.cmd, |
| 242 | + log_filename=os.path.join(remote_dst, self.name)) |
220 | 243 | if not remote_logfile: |
221 | 244 | LOGGER.warning( |
222 | 245 | "Nothing to collect. Command '%s' did not prepare log file on remote host '%s'", self.cmd, node.name) |
223 | 246 | return None |
224 | | - LogCollector.receive_log(node=node, |
225 | | - remote_log_path=remote_logfile, |
226 | | - local_dir=local_dst, |
227 | | - timeout=self.collect_timeout) |
228 | | - return os.path.join(local_dst, os.path.basename(remote_logfile)) |
| 247 | + local_path = Path(local_dst) / Path(remote_logfile).name |
| 248 | + if is_file_remote: |
| 249 | + LogCollector.receive_log(node=node, |
| 250 | + remote_log_path=remote_logfile, |
| 251 | + local_dir=local_dst, |
| 252 | + timeout=self.collect_timeout) |
| 253 | + else: |
| 254 | + # copy locally |
| 255 | + shutil.copyfile(remote_logfile, str(local_path)) |
| 256 | + return str(local_path) |
229 | 257 |
|
230 | 258 |
|
231 | 259 | class FileLog(CommandLog): |
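
With this change `LogCollector.collect_log_remotely` returns a `(path, is_file_remote)` tuple instead of a bare path, so any caller other than `CommandLog.collect` has to branch the same way. A minimal illustration of consuming the new contract (the `node`, command and destination paths are placeholders, not real SCT fixtures):

```python
import shutil
from pathlib import Path
from typing import Optional


def fetch_log(node, cmd: str, remote_dst: str, local_dst: str,
              timeout: Optional[int] = None) -> Optional[str]:
    # Collect on the node (SSH) or straight onto the test runner (SSM fallback).
    logfile, is_file_remote = LogCollector.collect_log_remotely(
        node=node, cmd=cmd, log_filename=str(Path(remote_dst) / "example.log"))
    if not logfile:
        return None
    local_path = Path(local_dst) / Path(logfile).name
    if is_file_remote:
        # The file sits on the node's disk - pull it over SSH as before.
        LogCollector.receive_log(node=node, remote_log_path=logfile,
                                 local_dir=local_dst, timeout=timeout)
    else:
        # SSM already wrote the output to a local file - just copy it into place.
        shutil.copyfile(logfile, str(local_path))
    return str(local_path)
```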
@@ -614,29 +642,76 @@ def create_local_storage_dir(self, base_local_dir): |
614 | 642 | def create_remote_storage_dir(self, node, path=''): |
615 | 643 | if not path: |
616 | 644 | path = node.name |
617 | | - try: |
618 | | - remote_dir = os.path.join(self.node_remote_dir, path) |
619 | | - result = node.remoter.run('mkdir -p {}'.format(remote_dir), ignore_status=True) |
| 645 | + remote_dir = os.path.join(self.node_remote_dir, path) |
620 | 646 |
|
621 | | - if result.exited > 0: |
622 | | - LOGGER.error( |
623 | | - 'Remote storing folder not created.\n{}'.format(result)) |
624 | | - remote_dir = self.node_remote_dir |
| 647 | +        if node.remoter.is_up(): |
625 | 648 |
|
626 | | - except Exception as details: # noqa: BLE001 |
627 | | - LOGGER.error("Error during creating remote directory %s", details) |
| 649 | + try: |
| 650 | + result = node.remoter.run('mkdir -p {}'.format(remote_dir), ignore_status=True) |
| 651 | + |
| 652 | + if result.exited > 0: |
| 653 | + LOGGER.error( |
| 654 | + 'Remote storing folder not created.\n{}'.format(result)) |
| 655 | + remote_dir = self.node_remote_dir |
| 656 | + |
| 657 | + except (Libssh2_Failure, InvokeFailure) as details: |
| 658 | + LOGGER.error("Error during creating remote directory %s", details) |
| 659 | +        elif (ssm_runner := node.aws_ssm_runner): |
| 660 | + try: |
| 661 | + ssm_result = ssm_runner.run('mkdir -p {}'.format(remote_dir), ignore_status=True) |
| 662 | +                if not ssm_result.ok: |
| 663 | +                    LOGGER.error("SSM command failed for instance %s: mkdir", node._instance.get("InstanceId")) |
| 664 | +                    remote_dir = self.node_remote_dir |
| 665 | + |
| 666 | + except (ClientError, AttributeError) as e: |
| 667 | + LOGGER.error("Failed to run SSM command: %s", e) |
| 668 | + remote_dir = self.node_remote_dir |
| 669 | + else: |
628 | 670 | remote_dir = self.node_remote_dir |
629 | 671 |
|
630 | 672 | return remote_dir |
631 | 673 |
|
632 | 674 | @staticmethod |
633 | | - def collect_log_remotely(node, cmd: str, log_filename: str) -> Optional[str]: |
| 675 | + def collect_log_remotely(node, cmd: str, log_filename: str) -> Tuple[Optional[str], bool]: |
634 | 676 | if not node.remoter: |
635 | | - return None |
636 | | - collect_log_command = f"{cmd} > '{log_filename}' 2>&1" |
637 | | - node.remoter.run(collect_log_command, ignore_status=True, verbose=True) |
638 | | - result = node.remoter.run(f"test -f '{log_filename}'", ignore_status=True) |
639 | | - return log_filename if result.ok else None |
| 677 | + return None, False |
| 678 | + |
| 679 | +        is_file_remote = True |
| 680 | +        ok = False  # stays False unless one of the collection paths below succeeds |
| 681 | + |
| 681 | + if ssh_connected := node.remoter.is_up(): |
| 682 | + try: |
| 683 | + collect_log_command = f"{cmd} > '{log_filename}' 2>&1" |
| 684 | + node.remoter.run(collect_log_command, ignore_status=True, verbose=True) |
| 685 | + result = node.remoter.run(f"test -f '{log_filename}'", ignore_status=True) |
| 686 | + ok = result.ok |
| 687 | + except (Libssh2_Failure, InvokeFailure): |
| 688 | + ssh_connected = False |
| 689 | + |
| 690 | +        # SSH is unavailable or failed - fall back to SSM if the node is AWS-based |
| 691 | + if not ssh_connected and (ssm_runner := node.aws_ssm_runner): |
| 692 | + LOGGER.info("Collecting Node %s via SSM: %s", node.name, log_filename) |
| 693 | + Path(log_filename).parent.mkdir(parents=True, exist_ok=True) |
| 694 | + |
| 695 | + # Use SSM to run the command and save it to a local file |
| 696 | + is_file_remote = False |
| 697 | + |
| 698 | + try: |
| 699 | +                collect_log_command = cmd |
| 700 | + ssm_result = ssm_runner.run_command_and_save_output( |
| 701 | + command=collect_log_command, |
| 702 | + local_output_file=log_filename, |
| 703 | + comment=f'Collect log {log_filename}', |
| 704 | + ignore_status=True |
| 705 | + ) |
| 706 | +                ok = ssm_result.ok |
| 707 | +                if not ok: |
| 708 | +                    LOGGER.error("SSM command failed for instance %s: %s", |
| 709 | +                                 node._instance.get("InstanceId"), collect_log_command) |
| 710 | + return None, is_file_remote |
| 711 | +            except (ClientError, AttributeError, TypeError, ValueError, KeyError, IndexError) as e: |
| 712 | + LOGGER.error("Failed to run SSM command: %s", e) |
| 713 | + return None, is_file_remote |
| 714 | +        return (log_filename if ok else None), is_file_remote |
640 | 715 |
|
641 | 716 | @staticmethod |
642 | 717 | def archive_log_remotely(node, log_filename: str, archive_name: Optional[str] = None) -> Optional[str]: |
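
`run_command_and_save_output` also lives in `sdcm.utils.aws_ssm_runner` and is not shown in this diff; its keyword arguments are taken from the call site above, but the body below is only a guess at how such a helper might be layered on top of a plain `run()`, for reviewers who want the intent of the SSM fallback spelled out.

```python
from pathlib import Path


def run_command_and_save_output(self, command: str, local_output_file: str,
                                comment: str = "", ignore_status: bool = False) -> "SSMResult":
    # Hypothetical helper: execute `command` on the instance via SSM and persist
    # its stdout on the test runner, since there is no SSH channel to scp through.
    # `comment` is presumably forwarded to send_command(Comment=...) by the real
    # runner; it is unused in this sketch.
    result = self.run(command, ignore_status=ignore_status)
    if result.ok:
        out_path = Path(local_output_file)
        out_path.parent.mkdir(parents=True, exist_ok=True)
        out_path.write_text(result.stdout)
    return result
```

Note that in this path `log_filename` ends up being a path on the test runner rather than on the node, which is why `CommandLog.collect` copies it with `shutil.copyfile` instead of `receive_log` when `is_file_remote` is False.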
|