@@ -13,7 +13,7 @@
 # Copyright (c) 2021 ScyllaDB
 
 from collections import defaultdict
-from datetime import datetime, timezone, timedelta
+from datetime import datetime, timezone, timedelta, UTC
 import os
 import re
 import sys
@@ -51,6 +51,7 @@
 from sdcm.sct_provision.instances_provider import provision_sct_resources
 from sdcm.sct_runner import AwsSctRunner, GceSctRunner, AzureSctRunner, get_sct_runner, clean_sct_runners, \
     update_sct_runner_tags, list_sct_runners
+
 from sdcm.utils.ci_tools import get_job_name, get_job_url
 from sdcm.utils.git import get_git_commit_id, get_git_status_info
 from sdcm.utils.argus import argus_offline_collect_events, create_proxy_argus_s3_url, get_argus_client
@@ -83,7 +84,8 @@
     list_resources_docker,
     list_parallel_timelines_report_urls,
     search_test_id_in_latest,
-    get_latest_scylla_release, images_dict_in_json_format,
+    get_latest_scylla_release, images_dict_in_json_format, get_hdr_tags,
+    download_and_unpack_logs,
 )
 from sdcm.utils.nemesis_generation import generate_nemesis_yaml, NemesisJobGenerator
 from sdcm.utils.open_with_diff import OpenWithDiff, ErrorCarrier
@@ -1933,6 +1935,107 @@ def upload_artifact_file(test_id: str, file_path: str, use_argus: bool, public: |
     return
 
 
+@cli.command("hdr-investigate", help="Analyze HDR files for latency spikes.\n"
+             "Usage example:\n"
+             "hydra hdr-investigate --stress-operation READ --throttled-load true "
+             "--test-id 8732ecb1-7e1f-44e7-b109-6d789b15f4b5 --start-time \"2025-09-14\\ 20:45:18\" "
+             "--duration-from-start-min 30")
+@click.option("--test-id", type=str, required=False,
+              help="If --hdr-folder is not provided, logs will be downloaded from Argus using this test ID")
+@click.option("--stress-tool", default="cassandra-stress", required=False,
+              type=click.Choice(['cassandra-stress', 'scylla-bench', 'latte'], case_sensitive=False),
+              help="Stress tool name. Supported tools: cassandra-stress|scylla-bench|latte")
+@click.option("--stress-operation", required=True,
+              type=click.Choice(["READ", "WRITE"], case_sensitive=False),
+              help="Supported stress operations: READ|WRITE")
+@click.option("--throttled-load", type=bool, required=True, help="Whether the load is throttled")
+@click.option("--start-time", type=str, required=True, help="Start time (UTC) in format 'YYYY-MM-DD\\ HH:MM:SS'")
+@click.option("--duration-from-start-min", type=int, required=True,
+              help="Time period in minutes in the HDR files to investigate, starting from --start-time")
+@click.option("--error-threshold-ms", type=int, default=10, required=False,
+              help="P99 latency threshold in ms above which an interval is reported as a spike")
+@click.option("--hdr-summary-interval-sec", type=int, default=600, required=False, help="Interval in seconds for the scan")
+@click.option("--hdr-folder", type=str, default=None, required=False,
+              help="Path to a folder with HDR files; if not provided, logs are downloaded from Argus")
+def hdr_investigate(test_id: str | None, stress_tool: str, stress_operation: str, throttled_load: bool,
+                    start_time: str, duration_from_start_min: int, error_threshold_ms: int,
+                    hdr_summary_interval_sec: int, hdr_folder: str | None) -> None:
+    """
+    Analyze HDR files for latency spikes.
+
+    Scans the HDR files over the requested time window and reports every interval
+    whose P99 latency exceeds the error threshold, sorted by P99 in descending order.
+
+    Args:
+        test_id (str): Test ID. If `hdr_folder` is not provided, logs will be downloaded from Argus using this ID to the /tmp/<test-id> folder.
+        stress_tool (str): Name of the stress tool. Supported: cassandra-stress, scylla-bench, latte.
+        stress_operation (str): Stress operation type. Supported: READ, WRITE.
+        throttled_load (bool): Whether the load is throttled.
+        start_time (str): Start time (UTC) in format 'YYYY-MM-DD HH:MM:SS'.
+        duration_from_start_min (int): Time period in minutes in the HDR files to investigate, starting from `start_time`.
+        error_threshold_ms (int): P99 latency threshold in ms above which an interval counts as a spike.
+        hdr_summary_interval_sec (int): Interval in seconds for the scan.
+        hdr_folder (str): Path to a folder with HDR files. If not provided, logs will be downloaded from Argus.
+
+    Returns:
+        None
+
+    Usage example:
+        hydra hdr-investigate --stress-operation READ --throttled-load true --test-id 8732ecb1-7e1f-44e7-b109-6d789b15f4b5
+        --start-time \"2025-09-14\\ 20:45:18\" --duration-from-start-min 30
+    """
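+    # local import, so the HDR histogram machinery is only loaded when this command runs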
+    from sdcm.utils.hdrhistogram import make_hdrhistogram_summary_by_interval
+    stress_operation = stress_operation.upper()
+
+    try:
+        # interpret start_time as UTC; .timestamp() yields seconds since the epoch
+        start_time_ts = datetime.strptime(start_time, '%Y-%m-%d %H:%M:%S').replace(tzinfo=UTC).timestamp()
+    except ValueError as exc:
+        raise ValueError("start_time must be in 'YYYY-MM-DD HH:MM:SS' format") from exc
+
+    if not hdr_folder:
+        if not test_id:
+            raise ValueError("Either test_id or hdr_folder must be provided")
+        # downloads the loader-set logs from Argus and unpacks them under /tmp/<test-id>
+        hdr_folder = download_and_unpack_logs(test_id, log_type='loader-set')
+
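+    # resolve which HDR tag names to inspect for this tool/operation/throttling
+    # combination; interval summaries are keyed as "<OPERATION>--<tag>"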
+    hdr_tags = get_hdr_tags(stress_tool, stress_operation, throttled_load)
+
+    # scan the requested window in fixed-size intervals
+    end_time_ts = start_time_ts + duration_from_start_min * 60
+    hdr_summaries = make_hdrhistogram_summary_by_interval(
+        hdr_tags, stress_operation, hdr_folder, start_time_ts, end_time_ts, interval=hdr_summary_interval_sec
+    )
+
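+    # each summary maps "<OPERATION>--<tag>" to that interval's percentile data,
+    # with start_time/end_time in ms since the epoch and percentile_99 in ms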
+    summaries = []
+    for tag in hdr_tags:
+        # collect the per-interval summaries recorded for this tag
+        tag_hdr_summaries = [
+            summary.get(f"{stress_operation}--{tag}", {})
+            for summary in hdr_summaries
+        ]
+        # keep only intervals whose P99 exceeds the error threshold, i.e. the spikes
+        tag_summaries = [
+            s for s in tag_hdr_summaries if s.get("percentile_99", 0) > error_threshold_ms
+        ]
+        # sort by percentile_99 descending so the worst spike comes first
+        tag_summaries.sort(key=lambda x: x["percentile_99"], reverse=True)
+        spikes = {}
+        for operation in tag_summaries:
+            # convert the interval boundaries (ms since the epoch) to human-readable UTC time
+            dt_start = datetime.fromtimestamp(operation["start_time"] / 1000, UTC).strftime('%Y-%m-%d %H:%M:%S')
+            dt_end = datetime.fromtimestamp(operation["end_time"] / 1000, UTC).strftime('%Y-%m-%d %H:%M:%S')
+            spikes[f"{dt_start} - {dt_end}"] = operation["percentile_99"]
+        summaries.append({"tag": tag, "spikes": spikes})
+
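+    # render one table row per (tag, spike interval) pair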
+    hdr_table = PrettyTable(["Tag", "Timeframe", "P99"])
+    hdr_table.align = "l"
+    for group in summaries:
+        # example group: {'tag': 'READ-st', 'spikes': {'2025-08-25 10:22:40 - 2025-08-25 10:32:40': 487.85}}
+        for time_frame, p99 in group["spikes"].items():
+            hdr_table.add_row([group["tag"], time_frame, p99])
+    click.echo(
+        f"\nFound P99 spikes higher than {error_threshold_ms} ms for tags {hdr_tags} "
+        f"with interval {hdr_summary_interval_sec} seconds\n")
+    click.echo(hdr_table.get_string(title="HDR Latency Spikes"))
+
+
 cli.add_command(sct_ssh.ssh)
 cli.add_command(sct_ssh.tunnel)
 cli.add_command(sct_ssh.copy_cmd)