diff --git a/README.md b/README.md
index 0b434018a78..06eb0be55f3 100644
--- a/README.md
+++ b/README.md
@@ -165,14 +165,4 @@ the available configuration options are listed in [configuration_options](./docs
 ### Manager Tests (TODO: write explanation for them)
 ### [K8S Functional Tests](./docs/k8s-functional-test.md)
 ### [Microbenchmarking Tests](./docs/microbenchmarking.md)
-
-# Local developer setup
-
-* Requires [uv](https://github.com/astral-sh/uv#installation)
-
-```bash
-uv python install
-uv venv
-export UV_PROJECT_ENVIRONMENT=.venv
-uv sync
-```
+### [Performance Tests](./docs/performance-tests.md)
diff --git a/docs/performance-tests.md b/docs/performance-tests.md
new file mode 100644
index 00000000000..2f8eaf88f03
--- /dev/null
+++ b/docs/performance-tests.md
@@ -0,0 +1,51 @@
+# Performance Tests
+
+## HDR investigate utility
+
+The hdr_investigate utility scans HDR (High Dynamic Range) histogram files at fine-grained time intervals,
+rather than only reporting overall or coarse-grained metrics. By analyzing latency metrics (such as P99) over
+small intervals, it pinpoints the exact time windows where latency spikes occur, so those windows can be
+correlated with specific events and with the Scylla process or operation causing the problem. This targeted
+approach makes performance troubleshooting in distributed database clusters faster and more accurate.
+
+Key features:
+- Supports multiple stress tools and operations (READ/WRITE).
+- Can fetch HDR files from Argus by test ID or use a local folder.
+- Allows specifying the time window and scan interval for analysis.
+- Reports intervals where P99 latency exceeds a user-defined threshold.
+
+Usage example:
+
+```bash
+hydra hdr-investigate \
+    --stress-operation READ \
+    --throttled-load true \
+    --test-id 8732ecb1-7e1f-44e7-b109-6d789b15f4b5 \
+    --start-time "2025-09-14\ 20:45:18" \
+    --duration-from-start-min 30
+```
+
+Main options:
+- --test-id: Test run identifier (fetches logs from Argus if --hdr-folder is not provided).
+- --stress-tool: Name of the stress tool (cassandra-stress, scylla-bench, or latte; default: cassandra-stress).
+- --stress-operation: Operation type (READ or WRITE).
+- --throttled-load: Whether the load was throttled (true or false).
+- --start-time: Start time for analysis (format: YYYY-MM-DD\ HH:MM:SS).
+- --duration-from-start-min: Duration in minutes to analyze from the start time.
+- --error-threshold-ms: P99 latency threshold in milliseconds (default: 10).
+- --hdr-summary-interval-sec: Interval in seconds for the summary scan (default: 600).
+- --hdr-folder: Path to a local folder with HDR files (optional).
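+
+Example output for an unthrottled READ run (illustrative values; the layout matches the table the command prints):
+
+```text
+Found P99 spikes higher than 10 ms for tags ['READ-st'] with interval 600 seconds
+
++--------------------------------------------------------------+
+|                      HDR Latency Spikes                      |
++---------+-------------------------------------------+--------+
+| Tag     | Timeframe                                 | P99    |
++---------+-------------------------------------------+--------+
+| READ-st | 2025-08-25 10:22:40 - 2025-08-25 10:32:40 | 487.85 |
++---------+-------------------------------------------+--------+
+```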
diff --git a/sct.py b/sct.py
index 05e56967dd6..e19a8112bfe 100755
--- a/sct.py
+++ b/sct.py
@@ -13,7 +13,7 @@
 # Copyright (c) 2021 ScyllaDB
 
 from collections import defaultdict
-from datetime import datetime, timezone, timedelta
+from datetime import datetime, timezone, timedelta, UTC
 import os
 import re
 import sys
@@ -51,6 +51,7 @@
 from sdcm.sct_provision.instances_provider import provision_sct_resources
 from sdcm.sct_runner import AwsSctRunner, GceSctRunner, AzureSctRunner, get_sct_runner, clean_sct_runners, \
     update_sct_runner_tags, list_sct_runners
+
 from sdcm.utils.ci_tools import get_job_name, get_job_url
 from sdcm.utils.git import get_git_commit_id, get_git_status_info
 from sdcm.utils.argus import argus_offline_collect_events, create_proxy_argus_s3_url, get_argus_client
@@ -83,7 +84,8 @@
     list_resources_docker, list_parallel_timelines_report_urls,
     search_test_id_in_latest,
-    get_latest_scylla_release, images_dict_in_json_format,
+    get_latest_scylla_release, images_dict_in_json_format, get_hdr_tags,
+    download_and_unpack_logs,
 )
 from sdcm.utils.nemesis_generation import generate_nemesis_yaml, NemesisJobGenerator
 from sdcm.utils.open_with_diff import OpenWithDiff, ErrorCarrier
@@ -1933,6 +1935,110 @@ def upload_artifact_file(test_id: str, file_path: str, use_argus: bool, public:
         return
 
 
+@cli.command("hdr-investigate", help="Analyze HDR files for latency spikes.\n"
+                                     "Usage example:\n"
+                                     "hydra hdr-investigate --stress-operation READ --throttled-load true "
+                                     "--test-id 8732ecb1-7e1f-44e7-b109-6d789b15f4b5 --start-time \"2025-09-14\\ 20:45:18\" "
+                                     "--duration-from-start-min 30")
+@click.option("--test-id", type=str, required=False,
+              help="If --hdr-folder is not provided, logs will be downloaded from Argus using this test_id")
+@click.option("--stress-tool", default="cassandra-stress", required=False,
+              type=click.Choice(['cassandra-stress', 'scylla-bench', 'latte'], case_sensitive=False),
+              help="Stress tool name. Supported tools: cassandra-stress|scylla-bench|latte")
+@click.option("--stress-operation", required=True,
+              type=click.Choice(["READ", "WRITE"], case_sensitive=False),
+              help="Supported stress operations: READ|WRITE")
+@click.option("--throttled-load", type=bool, required=True, help="Whether the load was throttled")
+@click.option("--start-time", type=str, required=True, help="Start time in format 'YYYY-MM-DD\\ HH:MM:SS'")
+@click.option("--duration-from-start-min", type=int, required=True,
+              help="Time period in minutes in the HDR file to investigate, starting from start-time")
+@click.option("--error-threshold-ms", type=int, default=10, required=False,
+              help="P99 threshold in ms above which an interval counts as a spike")
+@click.option("--hdr-summary-interval-sec", type=int, default=600, required=False, help="Interval in seconds for the scan")
+@click.option("--hdr-folder", type=str, default=None, required=False, help="Path to a folder with HDR files")
+def hdr_investigate(test_id: str, stress_tool: str, stress_operation: str, throttled_load: bool, start_time: str,
+                    duration_from_start_min: int, error_threshold_ms: int, hdr_summary_interval_sec: int, hdr_folder: str) -> None:
+    """
+    Analyze HDR files for latency spikes.
+
+    Scans HDR files over the requested time window and reports the intervals
+    with the highest P99 latency.
+
+    Args:
+        test_id (str): Test ID. If `hdr_folder` is not provided, logs will be downloaded from Argus using this ID into /tmp/<test_id>.
+        stress_tool (str): Name of the stress tool. Supported: cassandra-stress, scylla-bench, latte.
+        stress_operation (str): Stress operation type. Supported: READ, WRITE.
+        throttled_load (bool): Whether the load was throttled.
+        start_time (str): Start time in format 'YYYY-MM-DD HH:MM:SS'.
+        duration_from_start_min (int): Time period in minutes in the HDR file to investigate, starting from `start_time`.
+        error_threshold_ms (int): P99 threshold in ms above which an interval counts as a spike.
+        hdr_summary_interval_sec (int): Interval in seconds for the scan.
+        hdr_folder (str): Path to a folder with HDR files. If not provided, logs will be downloaded from Argus.
+
+    Returns:
+        None
+
+    Usage example:
+        hydra hdr-investigate --stress-operation READ --throttled-load true --test-id 8732ecb1-7e1f-44e7-b109-6d789b15f4b5
+        --start-time "2025-09-14\\ 20:45:18" --duration-from-start-min 30
+    """
+    from sdcm.utils.hdrhistogram import make_hdrhistogram_summary_by_interval
+    stress_operation = stress_operation.upper()
+
+    try:
+        start_time_sec = datetime.strptime(start_time, '%Y-%m-%d %H:%M:%S').replace(tzinfo=UTC).timestamp()
+    except ValueError:
+        raise ValueError("start_time must be in 'YYYY-MM-DD HH:MM:SS' format") from None
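+    # datetime.timestamp() yields epoch seconds; the scan window below is built in
+    # the same unit, which make_hdrhistogram_summary_by_interval is assumed to expect.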
+
+    if not hdr_folder:
+        if not test_id:
+            raise ValueError("Either test_id or hdr_folder must be provided")
+
+        hdr_folder = download_and_unpack_logs(test_id, log_type='loader-set')
+
+    hdr_tags = get_hdr_tags(stress_tool, stress_operation, throttled_load)
+
+    # Coarse scan over the requested time window
+    end_time_sec = start_time_sec + duration_from_start_min * 60
+    hdr_summaries = make_hdrhistogram_summary_by_interval(
+        hdr_tags, stress_operation, hdr_folder, start_time_sec, end_time_sec, interval=hdr_summary_interval_sec
+    )
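+    # hdr_summaries is assumed to be a list of per-interval dicts keyed by
+    # f"{stress_operation}--{tag}", each entry holding percentile_99 plus
+    # start_time/end_time in epoch milliseconds.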
+
+    summaries = []
+    for tag in hdr_tags:
+        # Collect the per-interval summaries recorded for this tag
+        tag_hdr_summaries = [
+            summary.get(f"{stress_operation}--{tag}", {})
+            for summary in hdr_summaries
+        ]
+        # Keep only intervals where percentile_99 exceeds the error threshold, i.e. spikes
+        tag_summaries = [
+            s for s in tag_hdr_summaries if s and "percentile_99" in s and s["percentile_99"] > error_threshold_ms
+        ]
+        # Sort by percentile_99 descending
+        tag_summaries.sort(key=lambda x: x["percentile_99"], reverse=True)
+        for operation in tag_summaries:
+            # Convert interval boundaries to human-readable UTC timestamps
+            dt_start = datetime.fromtimestamp(operation["start_time"] / 1000, UTC).strftime('%Y-%m-%d %H:%M:%S')
+            dt_end = datetime.fromtimestamp(operation["end_time"] / 1000, UTC).strftime('%Y-%m-%d %H:%M:%S')
+            summaries.append({"tag": tag, "spikes": {f"{dt_start} - {dt_end}": operation["percentile_99"]}})
+
+    hdr_table = PrettyTable(["Tag", "Timeframe", "P99"])
+    hdr_table.align = "l"
+    for group in summaries:
+        # Example: group: {'tag': 'READ-st', 'spikes': {'2025-08-25 10:22:40 - 2025-08-25 10:32:40': 487.85}}
+        for time_frame, p99 in group["spikes"].items():
+            hdr_table.add_row([group["tag"], time_frame, p99])
+    click.echo(
+        f"\nFound P99 spikes higher than {error_threshold_ms} ms for tags {hdr_tags} with interval {hdr_summary_interval_sec} seconds\n")
+    click.echo(hdr_table.get_string(title="HDR Latency Spikes"))
+
+
 cli.add_command(sct_ssh.ssh)
 cli.add_command(sct_ssh.tunnel)
 cli.add_command(sct_ssh.copy_cmd)
diff --git a/sdcm/utils/common.py b/sdcm/utils/common.py
index 0d86652f10e..9fb544ed439 100644
--- a/sdcm/utils/common.py
+++ b/sdcm/utils/common.py
@@ -2568,3 +2568,55 @@ def format_size(size_in_bytes):
         if size_in_bytes < 1024:
             return f"{size_in_bytes:.2f} {unit}"
         size_in_bytes /= 1024
+
+
+def get_hdr_tags(stress_tool: str, stress_operation: str, throttled_load: bool) -> list:
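+    """Map stress tool, operation, and throttle mode to HDR histogram tag names.
+
+    For example, get_hdr_tags("cassandra-stress", "MIXED", throttled_load=False)
+    returns ["WRITE-st", "READ-st"]; throttled runs use the "-rt" suffix instead.
+    """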
+    match stress_tool:
+        case "cassandra-stress":
+            suffix = "-rt" if throttled_load else "-st"
+            if stress_operation == "MIXED":
+                hdr_tags = [f"WRITE{suffix}", f"READ{suffix}"]
+            elif stress_operation == "READ":
+                hdr_tags = [f"READ{suffix}"]
+            elif stress_operation == "WRITE":
+                hdr_tags = [f"WRITE{suffix}"]
+            else:
+                raise ValueError(f"Unsupported stress_operation: {stress_operation}")
+        case "scylla-bench":
+            # TODO: will be defined later
+            raise NotImplementedError("get_hdr_tags: 'scylla-bench' is not yet supported.")
+        case "latte":
+            # TODO: will be defined later
+            raise NotImplementedError("get_hdr_tags: 'latte' is not yet supported.")
+        case _:
+            raise NotImplementedError(f"get_hdr_tags: unsupported stress tool {stress_tool!r}.")
+
+    return hdr_tags
+
+
+def download_and_unpack_logs(test_id: str, log_type: str, download_to: str = None) -> str:
+    logs_links = list_logs_by_test_id(test_id)
+    tmp_dir = download_to or os.path.join('/tmp/', test_id)
+    os.makedirs(tmp_dir, exist_ok=True)
+    logs_file = ""
+    for log in logs_links:
+        if log["type"] == log_type:
+            logs_file = S3Storage().download_file(link=log["link"], dst_dir=tmp_dir)
+            LOGGER.debug("Downloaded %s log to %s", log_type, logs_file)
+
+    if not logs_file:
+        raise ValueError(f"{log_type} not found in argus logs")
+
+    LOGGER.debug("Unpacking loader logs...")
+    from sdcm.monitorstack import extract_file_from_tar_archive
+    hdr_folder = extract_file_from_tar_archive(pattern=log_type, archive=logs_file, extract_dir=tmp_dir)
+    if not hdr_folder:
+        raise ValueError(f"Failed to unpack logs {logs_file} for test_id {test_id}")
+    LOGGER.debug("%s logs unpacked to %s", log_type, hdr_folder[test_id])
+
+    return hdr_folder[test_id]
diff --git a/sdcm/utils/hdrhistogram.py b/sdcm/utils/hdrhistogram.py
index c5c17da7d4f..982547834cf 100644
--- a/sdcm/utils/hdrhistogram.py
+++ b/sdcm/utils/hdrhistogram.py
@@ -163,7 +163,7 @@ def build_histograms_summary_with_interval(self, path: str,
         try:
             start_ts = int(self.start_time)
             end_ts = int(self.end_time)
-            if end_ts - start_ts < TIME_INTERVAL:
+            if end_ts - start_ts < interval:
                 window_step = int(end_ts - start_ts)
             else:
                 window_step = interval or TIME_INTERVAL