Commit 0cffffa

juliayakovlev authored and fruch committed

improvement(sct utility): enhance the analysis of HDR files

Adding a new SCT utility to enhance the analysis of HDR files by running build_histograms_summary_with_interval with a shorter interval. This will help identify the specific process causing high latency.

(cherry picked from commit 968cb0d)

1 parent 5eb8c94 · commit 0cffffa

File tree

4 files changed: +194 −13 lines changed

- README.md
- docs/performance-tests.md
- sct.py
- sdcm/utils/common.py

README.md

Lines changed: 1 addition & 11 deletions

````diff
@@ -165,14 +165,4 @@ the available configuration options are listed in [configuration_options](./docs
 ### Manager Tests (TODO: write explanation for them)
 ### [K8S Functional Tests](./docs/k8s-functional-test.md)
 ### [Microbenchmarking Tests](./docs/microbenchmarking.md)
-
-# Local developer setup
-
-* Requires [uv](https://github.com/astral-sh/uv#installation)
-
-```bash
-uv python install
-uv venv
-export UV_PROJECT_ENVIRONMENT=.venv
-uv sync
-```
+### [Performance Tests](./docs/performance-tests.md)
````

docs/performance-tests.md

Lines changed: 40 additions & 0 deletions

````diff
@@ -0,0 +1,40 @@
+# Performance Tests
+
+## HDR investigate utility
+
+The hdr_investigate utility is essential for performance analysis because it allows users to scan HDR (High Dynamic Range) histogram files
+with fine-grained time intervals, rather than only looking at overall or coarse-grained metrics.
+By analyzing latency metrics (such as P99) in smaller intervals, the tool helps pinpoint the exact time windows where latency spikes occur.
+This makes it possible to correlate these spikes with specific events or Scylla processes, enabling users to identify which Scylla process
+or operation is causing performance problems.
+This targeted approach greatly improves the efficiency and accuracy of performance troubleshooting in distributed database environments.
+
+Key features:
+- Supports multiple stress tools and operations (READ/WRITE).
+- Can fetch HDR files from Argus by test ID or use a local folder.
+- Allows specifying the time window and scan interval for analysis.
+- Reports intervals where P99 latency exceeds a user-defined threshold.
+
+Usage example:
+
+```bash
+hydra hdr-investigate \
+  --stress-operation READ \
+  --throttled-load true \
+  --test-id 8732ecb1-7e1f-44e7-b109-6d789b15f4b5 \
+  --start-time "2025-09-14\ 20:45:18" \
+  --duration-from-start-min 30
+```
+
+Main options:
+- --test-id: Test run identifier (fetches logs from Argus if --hdr-folder is not provided).
+- --stress-tool: Name of the stress tool (cassandra-stress, scylla-bench, or latte) (default: cassandra-stress).
+- --stress-operation: Operation type (READ or WRITE).
+- --throttled-load: Whether the load was throttled (True or False).
+- --start-time: Start time for analysis (format: YYYY-MM-DD\ HH:MM:SS).
+- --duration-from-start-min: Duration in minutes to analyze from the start time.
+- --error-threshold-ms: P99 latency threshold in milliseconds (default: 10).
+- --hdr-summary-interval-sec: Interval in seconds for summary scan (default: 600).
+- --hdr-folder: Path to local folder with HDR files (optional).
+
+This utility is useful for performance engineers and developers investigating latency issues in distributed database clusters.
````
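
For orientation, the utility reports detected spikes as a PrettyTable. A rough sketch of the output is shown below, using the illustrative values from the example comment in sct.py; the exact borders and column widths drawn by PrettyTable may differ:

```
Found P99 spikes higher than 10 ms for tags ['READ-st'] with interval 600 seconds

+---------------------------------------------------------------+
|                       HDR Latency Spikes                      |
+---------+--------------------------------------------+--------+
| Tag     | Timeframe                                  | P99    |
+---------+--------------------------------------------+--------+
| READ-st | 2025-08-25 10:22:40 - 2025-08-25 10:32:40 | 487.85 |
+---------+--------------------------------------------+--------+
```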

sct.py

Lines changed: 105 additions & 2 deletions

````diff
@@ -13,7 +13,7 @@
 # Copyright (c) 2021 ScyllaDB
 
 from collections import defaultdict
-from datetime import datetime, timezone, timedelta
+from datetime import datetime, timezone, timedelta, UTC
 import os
 import re
 import sys
@@ -51,6 +51,7 @@
 from sdcm.sct_provision.instances_provider import provision_sct_resources
 from sdcm.sct_runner import AwsSctRunner, GceSctRunner, AzureSctRunner, get_sct_runner, clean_sct_runners, \
     update_sct_runner_tags, list_sct_runners
+
 from sdcm.utils.ci_tools import get_job_name, get_job_url
 from sdcm.utils.git import get_git_commit_id, get_git_status_info
 from sdcm.utils.argus import argus_offline_collect_events, create_proxy_argus_s3_url, get_argus_client
@@ -83,7 +84,8 @@
     list_resources_docker,
     list_parallel_timelines_report_urls,
     search_test_id_in_latest,
-    get_latest_scylla_release, images_dict_in_json_format,
+    get_latest_scylla_release, images_dict_in_json_format, get_hdr_tags,
+    download_and_unpack_logs,
 )
 from sdcm.utils.nemesis_generation import generate_nemesis_yaml, NemesisJobGenerator
 from sdcm.utils.open_with_diff import OpenWithDiff, ErrorCarrier
@@ -1933,6 +1935,107 @@ def upload_artifact_file(test_id: str, file_path: str, use_argus: bool, public:
     return
 
 
+@cli.command("hdr-investigate", help="Analyze HDR files for latency spikes.\n"
+                                     "Usage example:\n"
+                                     "hydra hdr-investigate --stress-operation READ --throttled-load true "
+                                     "--test-id 8732ecb1-7e1f-44e7-b109-6d789b15f4b5 --start-time \"2025-09-14\\ 20:45:18\" "
+                                     "--duration-from-start-min 30")
+@click.option("--test-id", type=str, required=False,
+              help="If hdr-folder is not provided, logs will be downloaded from Argus using this test_id")
+@click.option("--stress-tool", default="cassandra-stress", required=False,
+              type=click.Choice(['cassandra-stress', 'scylla-bench', 'latte'], case_sensitive=False),
+              help="Stress tool name. Supported tools: cassandra-stress|scylla-bench|latte")
+@click.option("--stress-operation", required=True,
+              type=click.Choice(["READ", "WRITE"], case_sensitive=False),
+              help="Supported stress operations: READ|WRITE")
+@click.option("--throttled-load", type=bool, required=True, help="Whether the load is throttled")
+@click.option("--start-time", type=str, required=True, help="Start time in format 'YYYY-MM-DD\\ HH:MM:SS'")
+@click.option("--duration-from-start-min", type=int, required=True,
+              help="Time period in minutes in the HDR file to investigate, starting from start-time")
+@click.option("--error-threshold-ms", type=int, default=10, required=False,
+              help="Error threshold in ms for P99 to consider it a spike")
+@click.option("--hdr-summary-interval-sec", type=int, default=600, required=False, help="Interval in seconds for the scan")
+@click.option("--hdr-folder", type=str, default=None, required=False, help="Path to folder with HDR files")
+def hdr_investigate(test_id: str, stress_tool: str, stress_operation: str, throttled_load: bool, start_time: str,
+                    duration_from_start_min: int, error_threshold_ms: int, hdr_summary_interval_sec: int,
+                    hdr_folder: str) -> None:
+    """
+    Analyze HDR files for latency spikes.
+
+    Scans HDR files to identify the intervals with the highest P99 latency.
+
+    Args:
+        test_id (str): Test ID. If `hdr_folder` is not provided, logs will be downloaded from Argus using this ID
+            to the /tmp/<test-id> folder.
+        stress_tool (str): Name of the stress tool. Supported: cassandra-stress, scylla-bench, latte.
+        stress_operation (str): Stress operation type. Supported: READ, WRITE.
+        throttled_load (bool): Whether the load is throttled.
+        start_time (str): Start time in format 'YYYY-MM-DD HH:MM:SS'.
+        duration_from_start_min (int): Time period in minutes in the HDR file to investigate, starting from `start_time`.
+        error_threshold_ms (int): Error threshold in ms for P99 to consider it a spike.
+        hdr_summary_interval_sec (int): Interval in seconds for the scan.
+        hdr_folder (str): Path to folder with HDR files. If not provided, logs will be downloaded from Argus.
+
+    Returns:
+        None
+
+    Usage example:
+        hydra hdr-investigate --stress-operation READ --throttled-load true --test-id 8732ecb1-7e1f-44e7-b109-6d789b15f4b5
+        --start-time \"2025-09-14\\ 20:45:18\" --duration-from-start-min 30
+    """
+    from sdcm.utils.hdrhistogram import make_hdrhistogram_summary_by_interval
+    stress_operation = stress_operation.upper()
+
+    try:
+        start_time_ms = datetime.strptime(start_time, '%Y-%m-%d %H:%M:%S').replace(tzinfo=UTC).timestamp()
+    except ValueError:
+        raise ValueError("start_time must be in 'YYYY-MM-DD HH:MM:SS' format")
+
+    if not hdr_folder:
+        if not test_id:
+            raise ValueError("Either test_id or hdr_folder must be provided")
+
+        hdr_folder = download_and_unpack_logs(test_id, log_type='loader-set')
+
+    hdr_tags = get_hdr_tags(stress_tool, stress_operation, throttled_load)
+
+    # Step 1: Coarse scan
+    end_time_ms = start_time_ms + duration_from_start_min * 60
+    hdr_summaries = make_hdrhistogram_summary_by_interval(
+        hdr_tags, stress_operation, hdr_folder, start_time_ms, end_time_ms, interval=hdr_summary_interval_sec
+    )
+
+    summaries = []
+    for tag in hdr_tags:
+        # Find intervals for this tag
+        tag_hdr_summaries = [
+            summary.get(f"{stress_operation}--{tag}", {})
+            for summary in hdr_summaries
+        ]
+        # Keep intervals where percentile_99 exceeds the error threshold, i.e. spikes
+        tag_summaries = [
+            s for s in tag_hdr_summaries if s and "percentile_99" in s and s["percentile_99"] > error_threshold_ms
+        ]
+        # Sort by percentile_99 descending
+        tag_summaries.sort(key=lambda x: x["percentile_99"], reverse=True)
+        for operation in tag_summaries:
+            # Convert to human-readable time
+            dt_start = datetime.fromtimestamp(operation["start_time"] / 1000, UTC).strftime('%Y-%m-%d %H:%M:%S')
+            dt_end = datetime.fromtimestamp(operation["end_time"] / 1000, UTC).strftime('%Y-%m-%d %H:%M:%S')
+            summaries.append({"tag": tag, "spikes": {f"{dt_start} - {dt_end}": operation['percentile_99']}})
+
+    hdr_table = PrettyTable(["Tag", "Timeframe", "P99"])
+    hdr_table.align = "l"
+    for group in summaries:
+        # Example group: {'tag': 'READ-st', 'spikes': {'2025-08-25 10:22:40 - 2025-08-25 10:32:40': 487.85}}
+        for time_frame, p99 in group["spikes"].items():
+            hdr_table.add_row([group["tag"], time_frame, p99])
+    click.echo(
+        f"\nFound P99 spikes higher than {error_threshold_ms} ms for tags {hdr_tags} with interval {hdr_summary_interval_sec} seconds\n")
+    click.echo(hdr_table.get_string(title="HDR Latency Spikes"))
+
+
 cli.add_command(sct_ssh.ssh)
 cli.add_command(sct_ssh.tunnel)
 cli.add_command(sct_ssh.copy_cmd)
````
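
To make the spike-filtering step concrete, here is a minimal sketch of the per-interval summary shape the loop above assumes. The key format "OPERATION--tag" and the fields percentile_99, start_time, end_time (epoch milliseconds) are taken from the code; the actual objects returned by make_hdrhistogram_summary_by_interval may carry additional fields.

```python
# Sketch only: one element of `hdr_summaries`, as the spike-filtering loop assumes it.
# Keys follow the "<OPERATION>--<tag>" format; timestamps are epoch milliseconds.
interval_summary = {
    "READ--READ-st": {
        "start_time": 1756117360000,  # 2025-08-25 10:22:40 UTC
        "end_time": 1756117960000,    # 2025-08-25 10:32:40 UTC (one 600 s interval later)
        "percentile_99": 487.85,      # P99 latency for this interval, in ms
    },
}

# The loop keeps only intervals whose P99 exceeds the threshold:
error_threshold_ms = 10
spikes = {
    key: stats
    for key, stats in interval_summary.items()
    if stats.get("percentile_99", 0) > error_threshold_ms
}
print(spikes)  # {'READ--READ-st': {...}} -> reported as a spike
```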

sdcm/utils/common.py

Lines changed: 48 additions & 0 deletions

````diff
@@ -2568,3 +2568,51 @@ def format_size(size_in_bytes):
         if size_in_bytes < 1024:
             return f"{size_in_bytes:.2f} {unit}"
         size_in_bytes /= 1024
+
+
+def get_hdr_tags(stress_tool: str, stress_operation: str, throttled_load: bool) -> list:
+    match stress_tool:
+        case "cassandra-stress":
+            suffix = "-rt" if throttled_load else "-st"
+            if stress_operation == "MIXED":
+                hdr_tags = [f"WRITE{suffix}", f"READ{suffix}"]
+            elif stress_operation == "READ":
+                hdr_tags = [f"READ{suffix}"]
+            elif stress_operation == "WRITE":
+                hdr_tags = [f"WRITE{suffix}"]
+            else:
+                raise ValueError(f"Unsupported stress_operation: {stress_operation}")
+        case "scylla-bench":
+            # TODO: will be defined later
+            raise NotImplementedError("get_hdr_tags: 'scylla-bench' is not yet supported.")
+        case "latte":
+            # TODO: will be defined later
+            raise NotImplementedError("get_hdr_tags: 'latte' is not yet supported.")
+        case _:
+            raise NotImplementedError(f"get_hdr_tags: unsupported stress tool '{stress_tool}'.")
+
+    return hdr_tags
+
+
+def download_and_unpack_logs(test_id: str, log_type: str, download_to: str = None) -> str:
+    logs_links = list_logs_by_test_id(test_id)
+    tmp_dir = download_to or os.path.join('/tmp/', test_id)
+    if not os.path.exists(tmp_dir):
+        os.makedirs(tmp_dir)
+    logs_file = ""
+    for log in logs_links:
+        if log["type"] == log_type:
+            logs_file = S3Storage().download_file(link=log["link"], dst_dir=tmp_dir)
+            LOGGER.debug("Downloaded %s log to %s", log_type, logs_file)
+
+    if not logs_file:
+        raise ValueError(f"{log_type} not found in argus logs")
+
+    LOGGER.debug("Unpacking loader logs...")
+    from sdcm.monitorstack import extract_file_from_tar_archive
+    hdr_folder = extract_file_from_tar_archive(pattern=log_type, archive=logs_file, extract_dir=tmp_dir)
+    if not hdr_folder:
+        raise ValueError(f"Failed to unpack logs {logs_file} for test_id {test_id}")
+    LOGGER.debug("%s logs unpacked to %s", log_type, hdr_folder[test_id])
+
+    return hdr_folder[test_id]
````
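
A quick illustration of the tag mapping, derived directly from the cassandra-stress branch of get_hdr_tags above (the other tools currently raise NotImplementedError):

```python
# Illustrative only: expected results of get_hdr_tags for cassandra-stress.
# Unthrottled loads map to "-st" tags, throttled loads to "-rt" tags.
assert get_hdr_tags("cassandra-stress", "READ", throttled_load=False) == ["READ-st"]
assert get_hdr_tags("cassandra-stress", "WRITE", throttled_load=True) == ["WRITE-rt"]
assert get_hdr_tags("cassandra-stress", "MIXED", throttled_load=True) == ["WRITE-rt", "READ-rt"]
```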
