12 changes: 1 addition & 11 deletions README.md
@@ -165,14 +165,4 @@ the available configuration options are listed in [configuration_options](./docs
### Manager Tests (TODO: write explanation for them)
### [K8S Functional Tests](./docs/k8s-functional-test.md)
### [Microbenchmarking Tests](./docs/microbenchmarking.md)

# Local developer setup

* Requires [uv](https://github.com/astral-sh/uv#installation)

```bash
uv python install
uv venv
export UV_PROJECT_ENVIRONMENT=.venv
uv sync
```
### [Performance Tests](./docs/performance-tests.md)
40 changes: 40 additions & 0 deletions docs/performance-tests.md
@@ -0,0 +1,40 @@
# Performance Tests

## HDR investigate utility

The `hdr-investigate` utility scans HDR (High Dynamic Range) histogram files at fine-grained time intervals,
rather than reporting only overall or coarse-grained metrics.
By computing latency metrics (such as P99) over small intervals, it pinpoints the exact time windows where latency spikes occur.
Those spikes can then be correlated with specific events, so the Scylla process or operation causing the performance
problem can be identified. This makes performance troubleshooting in distributed database clusters faster and more precise.
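
The idea in miniature, as a sketch only (this is not the SCT implementation; it assumes the PyPI `hdrhistogram` package, and the sample data and window size are hypothetical):

```python
# Sketch: bucket raw latency samples into fixed windows and report each
# window's P99, so a spike can be matched to the events in that window.
from hdrh.histogram import HdrHistogram

INTERVAL_SEC = 600   # like --hdr-summary-interval-sec
THRESHOLD_MS = 10    # like --error-threshold-ms

def p99_per_window(samples: list[tuple[float, float]]) -> dict[int, int]:
    """samples: (unix_ts_sec, latency_ms) pairs -> {window_start_ts: p99_ms}."""
    windows: dict[int, HdrHistogram] = {}
    for ts, latency_ms in samples:
        window = int(ts // INTERVAL_SEC) * INTERVAL_SEC
        # 3 significant digits, values tracked from 1 ms to 60 s
        hist = windows.setdefault(window, HdrHistogram(1, 60_000, 3))
        hist.record_value(round(latency_ms))
    return {w: h.get_value_at_percentile(99) for w, h in sorted(windows.items())}

# Hypothetical data: 10 minutes of healthy 5 ms latency, then a 480 ms spike.
samples = [(1_735_000_000 + i, 5.0 if i < 600 else 480.0) for i in range(1200)]
spikes = {w: p99 for w, p99 in p99_per_window(samples).items() if p99 > THRESHOLD_MS}
print(spikes)  # only the windows containing the spike are reported
```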

Key features:
- Supports multiple stress tools and operations (READ/WRITE).
- Can fetch HDR files from Argus by test ID or use a local folder.
- Allows specifying the time window and scan interval for analysis.
- Reports intervals where P99 latency exceeds a user-defined threshold.

Usage example:

```bash
hydra hdr-investigate \
    --stress-operation READ \
    --throttled-load true \
    --test-id 8732ecb1-7e1f-44e7-b109-6d789b15f4b5 \
    --start-time "2025-09-14 20:45:18" \
    --duration-from-start-min 30
```

Main options:
- --test-id: Test run identifier (fetches logs from Argus if --hdr-folder is not provided).
- --stress-tool: Name of the stress tool: cassandra-stress, scylla-bench, or latte (default: cassandra-stress; only cassandra-stress is implemented so far).
- --stress-operation: Operation type (READ or WRITE).
- --throttled-load: Whether the load was throttled (True or False).
- --start-time: Start time for analysis (format: "YYYY-MM-DD HH:MM:SS"; quote the value, since it contains a space).
- --duration-from-start-min: Duration in minutes to analyze from the start time.
- --error-threshold-ms: P99 latency threshold in milliseconds (default: 10).
- --hdr-summary-interval-sec: Interval in seconds for summary scan (default: 600).
- --hdr-folder: Path to local folder with HDR files (optional).
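
If the HDR files are already on disk, the same scan can be run against a local folder instead of Argus (the folder path below is hypothetical):

```bash
hydra hdr-investigate \
    --stress-operation WRITE \
    --throttled-load false \
    --hdr-folder /tmp/my-test/loader-set \
    --start-time "2025-09-14 20:45:18" \
    --duration-from-start-min 30 \
    --error-threshold-ms 20
```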

This utility is useful for performance engineers and developers investigating latency issues in distributed database clusters.
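
The report lists, per tag, each interval whose P99 exceeded the threshold. Illustrative output (values taken from the example in the implementation; exact table formatting may differ):

```text
Found P99 spikes higher than 10 ms for tags ['READ-st'] with interval 600 seconds

Tag      Timeframe                                    P99
READ-st  2025-08-25 10:22:40 - 2025-08-25 10:32:40    487.85
```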
107 changes: 105 additions & 2 deletions sct.py
@@ -13,7 +13,7 @@
# Copyright (c) 2021 ScyllaDB

from collections import defaultdict
from datetime import datetime, timezone, timedelta
from datetime import datetime, timezone, timedelta, UTC
import os
import re
import sys
@@ -51,6 +51,7 @@
from sdcm.sct_provision.instances_provider import provision_sct_resources
from sdcm.sct_runner import AwsSctRunner, GceSctRunner, AzureSctRunner, get_sct_runner, clean_sct_runners, \
    update_sct_runner_tags, list_sct_runners

from sdcm.utils.ci_tools import get_job_name, get_job_url
from sdcm.utils.git import get_git_commit_id, get_git_status_info
from sdcm.utils.argus import argus_offline_collect_events, create_proxy_argus_s3_url, get_argus_client
@@ -83,7 +84,8 @@
    list_resources_docker,
    list_parallel_timelines_report_urls,
    search_test_id_in_latest,
    get_latest_scylla_release, images_dict_in_json_format, get_hdr_tags,
    download_and_unpack_logs,
)
from sdcm.utils.nemesis_generation import generate_nemesis_yaml, NemesisJobGenerator
from sdcm.utils.open_with_diff import OpenWithDiff, ErrorCarrier
@@ -1933,6 +1935,107 @@ def upload_artifact_file(test_id: str, file_path: str, use_argus: bool, public:
    return


@cli.command("hdr-investigate", help="Analyze HDR file for latency spikes.\n"
"Usage example:\n"
"hydra hdr-investigate --stress-operation READ --throttled-load true "
"--test-id 8732ecb1-7e1f-44e7-b109-6d789b15f4b5 --start-time \"2025-09-14\\ 20:45:18\" "
"--duration-from-start-min 30")
@click.option("--test-id", type=str, required=False, help="If hdr_folder is not provided, logs will be downloaded from argus "
"using this test_id")
@click.option("--stress-tool", default="cassandra-stress", required=False,
type=click.Choice(['cassandra-stress', 'scylla-bench', 'latte'], case_sensitive=False),
help="stress tool name. Supported tools: cassandra-stress|scylla-bench|latte")
@click.option("--stress-operation", required=True,
type=click.Choice(["READ", "WRITE"], case_sensitive=False),
help="Supported stress operations: READ|WRITE")
@click.option("--throttled-load", type=bool, required=True, help="Is the load throttled or not")
@click.option("--start-time", type=str, required=True, help="Start time in format 'YYYY-MM-DD\\ HH:MM:SS'")
@click.option("--duration-from-start-min", type=int, required=True,
help="Time period in minutes in HDR file to investigate, started from start-time ")
@click.option("--error-threshold-ms", type=int, default=10, required=False,
help="Error threshold in ms for P99 to consider it as a spike")
@click.option("--hdr-summary-interval-sec", type=int, default=600, required=False, help="Interval in seconds for scan")
@click.option("--hdr-folder", type=str, default=None, required=False, help="Path to folder with hdr files. ")
def hdr_investigate(test_id: str, stress_tool: str, stress_operation: str, throttled_load: bool, start_time: str,
duration_from_start_min: int, error_threshold_ms: int, hdr_summary_interval_sec: int, hdr_folder: str) -> None:
"""
Analyze HDR file for latency spikes.

This function scans HDR files to identify intervals with high latency spikes.
It performs a scan to find intervals with the highest P99 latency
Args:
test_id (str): Test ID. If `hdr_folder` is not provided, logs will be downloaded from Argus using this ID to /tmp/<test-id> folder.
stress_tool (str): Name of the stress tool. Supported: cassandra-stress, scylla-bench, latte.
stress_operation (str): Stress operation type. Supported: READ, WRITE.
throttled_load (bool): Whether the load is throttled.
start_time (str): Start time in format 'YYYY-MM-DD HH:MM:SS'.
duration_from_start_min (int): Time period in minutes in HDR file to investigate, starting from `start_time`.
error_threshold_ms (int): Error threshold in ms for P99 to consider it as a spike.
hdr_summary_interval_sec (int): Interval in seconds for scan.
hdr_folder (str): Path to folder with HDR files. If not provided, logs will be downloaded from Argus.

Returns:
None

Usage example:
hydra hdr-investigate --stress-operation READ --throttled-load true --test-id 8732ecb1-7e1f-44e7-b109-6d789b15f4b5
--start-time \"2025-09-14\\ 20:45:18\" --duration-from-start-min 30
"""
    from sdcm.utils.hdrhistogram import make_hdrhistogram_summary_by_interval
    stress_operation = stress_operation.upper()

    try:
        # datetime.timestamp() returns seconds since the epoch
        start_time_sec = datetime.strptime(start_time, '%Y-%m-%d %H:%M:%S').replace(tzinfo=UTC).timestamp()
    except ValueError as exc:
        raise ValueError("start_time must be in 'YYYY-MM-DD HH:MM:SS' format") from exc

    if not hdr_folder:
        if not test_id:
            raise ValueError("Either test_id or hdr_folder must be provided")

        hdr_folder = download_and_unpack_logs(test_id, log_type='loader-set')

    hdr_tags = get_hdr_tags(stress_tool, stress_operation, throttled_load)

    # Scan the HDR files in fixed-size intervals
    end_time_sec = start_time_sec + duration_from_start_min * 60
    hdr_summaries = make_hdrhistogram_summary_by_interval(
        hdr_tags, stress_operation, hdr_folder, start_time_sec, end_time_sec, interval=hdr_summary_interval_sec
    )

    summaries = []
    for tag in hdr_tags:
        # Collect the per-interval summaries for this tag
        tag_hdr_summaries = [
            summary.get(f"{stress_operation}--{tag}", {})
            for summary in hdr_summaries
        ]
        # Keep only intervals where percentile_99 exceeds the error threshold, i.e. spikes
        tag_summaries = [
            s for s in tag_hdr_summaries if s and "percentile_99" in s and s["percentile_99"] > error_threshold_ms
        ]
        # Sort by percentile_99 descending
        tag_summaries.sort(key=lambda x: x["percentile_99"], reverse=True)
        for operation in tag_summaries:
            # Convert interval boundaries (stored in ms) to human-readable UTC time
            dt_start = datetime.fromtimestamp(operation["start_time"] / 1000, UTC).strftime('%Y-%m-%d %H:%M:%S')
            dt_end = datetime.fromtimestamp(operation["end_time"] / 1000, UTC).strftime('%Y-%m-%d %H:%M:%S')
            summaries.append({"tag": tag, "spikes": {f"{dt_start} - {dt_end}": operation['percentile_99']}})

    hdr_table = PrettyTable(["Tag", "Timeframe", "P99"])
    hdr_table.align = "l"
    for group in summaries:
        # Example group: {'tag': 'READ-st', 'spikes': {'2025-08-25 10:22:40 - 2025-08-25 10:32:40': 487.85}}
        for time_frame, p99 in group["spikes"].items():
            hdr_table.add_row([group["tag"], time_frame, p99])
    click.echo(
        f"\nFound P99 spikes higher than {error_threshold_ms} ms for tags {hdr_tags} "
        f"with interval {hdr_summary_interval_sec} seconds\n")
    click.echo(hdr_table.get_string(title="HDR Latency Spikes"))


cli.add_command(sct_ssh.ssh)
cli.add_command(sct_ssh.tunnel)
cli.add_command(sct_ssh.copy_cmd)
48 changes: 48 additions & 0 deletions sdcm/utils/common.py
@@ -2568,3 +2568,51 @@ def format_size(size_in_bytes):
        if size_in_bytes < 1024:
            return f"{size_in_bytes:.2f} {unit}"
        size_in_bytes /= 1024


def get_hdr_tags(stress_tool: str, stress_operation: str, throttled_load: bool) -> list:
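    """Return the HDR histogram tag names recorded for the given stress tool and operation.

    For cassandra-stress the tags look like ``READ-st``/``WRITE-rt``: the suffix is
    ``-rt`` for a throttled load and ``-st`` otherwise. Other tools are not supported yet.
    """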
    match stress_tool:
        case "cassandra-stress":
            suffix = "-rt" if throttled_load else "-st"
            if stress_operation == "MIXED":
                hdr_tags = [f"WRITE{suffix}", f"READ{suffix}"]
            elif stress_operation == "READ":
                hdr_tags = [f"READ{suffix}"]
            elif stress_operation == "WRITE":
                hdr_tags = [f"WRITE{suffix}"]
            else:
                raise ValueError(f"Unsupported stress_operation: {stress_operation}")
        case "scylla-bench":
            # TODO: will be defined later
            raise NotImplementedError("get_hdr_tags: 'scylla-bench' is not yet supported.")
        case "latte":
            # TODO: will be defined later
            raise NotImplementedError("get_hdr_tags: 'latte' is not yet supported.")
        case _:
            raise NotImplementedError(f"get_hdr_tags: unsupported stress tool {stress_tool!r}.")

    return hdr_tags


def download_and_unpack_logs(test_id: str, log_type: str, download_to: str = None) -> str:
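    """Download the test's log archive of the given type from Argus/S3 and unpack it.

    Returns the path to the folder extracted for ``test_id`` (under ``/tmp/<test_id>``
    unless ``download_to`` is given).
    """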
    logs_links = list_logs_by_test_id(test_id)
    tmp_dir = download_to or os.path.join('/tmp/', test_id)
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)
    logs_file = ""
    for log in logs_links:
        if log["type"] == log_type:
            logs_file = S3Storage().download_file(link=log["link"], dst_dir=tmp_dir)
            LOGGER.debug("Downloaded %s log to %s", log_type, logs_file)

    if not logs_file:
        raise ValueError(f"{log_type} not found in Argus logs")

    LOGGER.debug("Unpacking loader logs...")
    from sdcm.monitorstack import extract_file_from_tar_archive
    hdr_folder = extract_file_from_tar_archive(pattern=log_type, archive=logs_file, extract_dir=tmp_dir)
    if not hdr_folder:
        raise ValueError(f"Failed to unpack logs {logs_file} for test_id {test_id}")
    LOGGER.debug("%s logs unpacked to %s", log_type, hdr_folder[test_id])

    return hdr_folder[test_id]
2 changes: 1 addition & 1 deletion sdcm/utils/hdrhistogram.py
@@ -163,7 +163,7 @@ def build_histograms_summary_with_interval(self, path: str,
        try:
            start_ts = int(self.start_time)
            end_ts = int(self.end_time)
            if end_ts - start_ts < interval:
                window_step = int(end_ts - start_ts)
            else:
                window_step = interval or TIME_INTERVAL