diff --git a/redis_benchmarks_specification/__compare__/args.py b/redis_benchmarks_specification/__compare__/args.py
index 5e014b9..c47f3ac 100644
--- a/redis_benchmarks_specification/__compare__/args.py
+++ b/redis_benchmarks_specification/__compare__/args.py
@@ -230,4 +230,11 @@ def create_compare_arguments(parser):
action="store_true",
help="Skip interactive approval of changes to github before applying.",
)
+ parser.add_argument(
+ "--compare-by-env",
+ required=False,
+ default=False,
+ action="store_true",
+ help="Compare across environments for a single test, instead of the default behavior of comparing across tests for each environment.",
+ )
return parser
diff --git a/redis_benchmarks_specification/__compare__/compare.py b/redis_benchmarks_specification/__compare__/compare.py
index ab7c7b3..1a1bd30 100644
--- a/redis_benchmarks_specification/__compare__/compare.py
+++ b/redis_benchmarks_specification/__compare__/compare.py
@@ -367,66 +367,140 @@ def compare_command_logic(args, project_name, project_version):
)
grafana_link_base = "https://benchmarksredisio.grafana.net/d/1fWbtb7nz/experimental-oss-spec-benchmarks"
- (
- detected_regressions,
- table_output,
- improvements_list,
- regressions_list,
- total_stable,
- total_unstable,
- total_comparison_points,
- boxplot_data,
- command_change,
- ) = compute_regression_table(
- rts,
- tf_github_org,
- tf_github_repo,
- triggering_env_baseline,
- triggering_env_comparison,
- metric_name,
- comparison_branch,
- baseline_branch,
- baseline_tag,
- comparison_tag,
- baseline_deployment_name,
- comparison_deployment_name,
- print_improvements_only,
- print_regressions_only,
- skip_unstable,
- regressions_percent_lower_limit,
- simplify_table,
- test,
- testname_regex,
- verbose,
- last_n_baseline,
- last_n_comparison,
- metric_mode,
- from_date,
- from_ts_ms,
- to_date,
- to_ts_ms,
- use_metric_context_path,
- running_platform_baseline,
- running_platform_comparison,
- baseline_target_version,
- comparison_target_version,
- baseline_hash,
- comparison_hash,
- baseline_github_repo,
- comparison_github_repo,
- baseline_target_branch,
- comparison_target_branch,
- baseline_github_org,
- comparison_github_org,
- args.regression_str,
- args.improvement_str,
- tests_with_config,
- args.use_test_suites_folder,
- testsuites_folder,
- args.extra_filters,
- getattr(args, "command_group_regex", ".*"),
- getattr(args, "command_regex", ".*"),
- )
+ # Check if environment comparison mode is enabled
+ compare_by_env = getattr(args, "compare_by_env", False)
+
+ if compare_by_env:
+ logging.info("Environment comparison mode enabled")
+ # Environment (deployment name) list to compare across; currently a fixed default set
+ env_list = [
+ "oss-standalone",
+ "oss-standalone-02-io-threads",
+ "oss-standalone-04-io-threads",
+ "oss-standalone-08-io-threads",
+ ]
+
+ (
+ detected_regressions,
+ table_output,
+ improvements_list,
+ regressions_list,
+ total_stable,
+ total_unstable,
+ total_comparison_points,
+ boxplot_data,
+ command_change,
+ ) = compute_env_comparison_table(
+ rts,
+ tf_github_org,
+ tf_github_repo,
+ triggering_env_baseline,
+ triggering_env_comparison,
+ metric_name,
+ comparison_branch,
+ baseline_branch,
+ baseline_tag,
+ comparison_tag,
+ env_list,
+ print_improvements_only,
+ print_regressions_only,
+ skip_unstable,
+ regressions_percent_lower_limit,
+ simplify_table,
+ test,
+ testname_regex,
+ verbose,
+ last_n_baseline,
+ last_n_comparison,
+ metric_mode,
+ from_date,
+ from_ts_ms,
+ to_date,
+ to_ts_ms,
+ use_metric_context_path,
+ running_platform_baseline,
+ running_platform_comparison,
+ baseline_target_version,
+ comparison_target_version,
+ baseline_hash,
+ comparison_hash,
+ baseline_github_repo,
+ comparison_github_repo,
+ baseline_target_branch,
+ comparison_target_branch,
+ baseline_github_org,
+ comparison_github_org,
+ args.regression_str,
+ args.improvement_str,
+ tests_with_config,
+ args.use_test_suites_folder,
+ testsuites_folder,
+ args.extra_filters,
+ getattr(args, "command_group_regex", ".*"),
+ getattr(args, "command_regex", ".*"),
+ )
+ else:
+ logging.info("Default test comparison mode")
+ (
+ detected_regressions,
+ table_output,
+ improvements_list,
+ regressions_list,
+ total_stable,
+ total_unstable,
+ total_comparison_points,
+ boxplot_data,
+ command_change,
+ ) = compute_regression_table(
+ rts,
+ tf_github_org,
+ tf_github_repo,
+ triggering_env_baseline,
+ triggering_env_comparison,
+ metric_name,
+ comparison_branch,
+ baseline_branch,
+ baseline_tag,
+ comparison_tag,
+ baseline_deployment_name,
+ comparison_deployment_name,
+ print_improvements_only,
+ print_regressions_only,
+ skip_unstable,
+ regressions_percent_lower_limit,
+ simplify_table,
+ test,
+ testname_regex,
+ verbose,
+ last_n_baseline,
+ last_n_comparison,
+ metric_mode,
+ from_date,
+ from_ts_ms,
+ to_date,
+ to_ts_ms,
+ use_metric_context_path,
+ running_platform_baseline,
+ running_platform_comparison,
+ baseline_target_version,
+ comparison_target_version,
+ baseline_hash,
+ comparison_hash,
+ baseline_github_repo,
+ comparison_github_repo,
+ baseline_target_branch,
+ comparison_target_branch,
+ baseline_github_org,
+ comparison_github_org,
+ args.regression_str,
+ args.improvement_str,
+ tests_with_config,
+ args.use_test_suites_folder,
+ testsuites_folder,
+ args.extra_filters,
+ getattr(args, "command_group_regex", ".*"),
+ getattr(args, "command_regex", ".*"),
+ )
total_regressions = len(regressions_list)
total_improvements = len(improvements_list)
prepare_regression_comment(
@@ -696,7 +770,7 @@ def extract_default_branch_and_metric(defaults_filename):
return default_baseline_branch, default_metrics_str
-def compute_regression_table(
+def compute_env_comparison_table(
rts,
tf_github_org,
tf_github_repo,
@@ -707,8 +781,7 @@ def compute_regression_table(
baseline_branch="unstable",
baseline_tag=None,
comparison_tag=None,
- baseline_deployment_name="oss-standalone",
- comparison_deployment_name="oss-standalone",
+ env_list=None,
print_improvements_only=False,
print_regressions_only=False,
skip_unstable=False,
@@ -746,6 +819,9 @@ def compute_regression_table(
command_group_regex=".*",
command_regex=".*",
):
+ """
+ Compute a comparison table for a single test across multiple environments (deployment names).
+ """
START_TIME_NOW_UTC, _, _ = get_start_time_vars()
START_TIME_LAST_MONTH_UTC = START_TIME_NOW_UTC - datetime.timedelta(days=31)
if from_date is None:
@@ -756,13 +832,38 @@ def compute_regression_table(
from_ts_ms = int(from_date.timestamp() * 1000)
if to_ts_ms is None:
to_ts_ms = int(to_date.timestamp() * 1000)
- from_human_str = humanize.naturaltime(
- dt.datetime.utcfromtimestamp(from_ts_ms / 1000)
- )
- to_human_str = humanize.naturaltime(dt.datetime.utcfromtimestamp(to_ts_ms / 1000))
- logging.info(
- "Using a time-delta from {} to {}".format(from_human_str, to_human_str)
- )
+
+ # Extract test names from the test parameter
+ test_names = []
+ if test != "":
+ test_names = test.split(",")
+
+ # If no specific test provided, we need at least one test for environment comparison
+ if not test_names:
+ logging.error(
+ "Environment comparison requires specifying at least one test via --test parameter"
+ )
+ return None, "", [], [], 0, 0, 0, [], {}
+
+ # For environment comparison, we focus on the first test if multiple are provided
+ test_name = test_names[0]
+ if len(test_names) > 1:
+ logging.warning(
+ f"Environment comparison mode: using only the first test '{test_name}' from the provided list"
+ )
+
+ # Default environment list if not provided
+ if env_list is None:
+ env_list = [
+ "oss-standalone",
+ "oss-standalone-02-io-threads",
+ "oss-standalone-04-io-threads",
+ "oss-standalone-08-io-threads",
+ ]
+
+ logging.info(f"Comparing environments {env_list} for test '{test_name}'")
+
+ # Build baseline and comparison strings
baseline_str, by_str_baseline, comparison_str, by_str_comparison = get_by_strings(
baseline_branch,
comparison_branch,
@@ -770,52 +871,13 @@ def compute_regression_table(
comparison_tag,
baseline_target_version,
comparison_target_version,
- comparison_hash,
baseline_hash,
+ comparison_hash,
baseline_target_branch,
comparison_target_branch,
)
- logging.info(f"Using baseline filter {by_str_baseline}={baseline_str}")
- logging.info(f"Using comparison filter {by_str_comparison}={comparison_str}")
- (
- prefix,
- testcases_setname,
- _,
- tsname_project_total_failures,
- tsname_project_total_success,
- _,
- _,
- _,
- testcases_metric_context_path_setname,
- _,
- _,
- _,
- _,
- _,
- ) = get_overall_dashboard_keynames(
- tf_github_org, tf_github_repo, tf_triggering_env_baseline
- )
- test_names = []
- used_key = testcases_setname
- test_filter = "test_name"
- if use_metric_context_path:
- test_filter = "test_name:metric_context_path"
- used_key = testcases_metric_context_path_setname
- tags_regex_string = re.compile(testname_regex)
- if test != "":
- test_names = test.split(",")
- logging.info("Using test name {}".format(test_names))
- elif use_test_suites_folder:
- test_names = get_test_names_from_yaml_files(
- test_suites_folder, tags_regex_string
- )
- else:
- test_names = get_test_names_from_db(
- rts, tags_regex_string, test_names, used_key
- )
- # Apply command regex filtering to tests_with_config
- tests_with_config = filter_tests_by_command_regex(tests_with_config, command_regex)
+ test_filter = "test_name"
(
detected_regressions,
@@ -838,9 +900,9 @@ def compute_regression_table(
group_change,
command_change,
boxplot_data,
- ) = from_rts_to_regression_table(
- baseline_deployment_name,
- comparison_deployment_name,
+ ) = from_rts_to_env_comparison_table(
+ test_name,
+ env_list,
baseline_str,
comparison_str,
by_str_baseline,
@@ -858,7 +920,6 @@ def compute_regression_table(
rts,
simplify_table,
test_filter,
- test_names,
tf_triggering_env_baseline,
tf_triggering_env_comparison,
verbose,
@@ -873,64 +934,23 @@ def compute_regression_table(
tests_with_config,
extra_filters,
)
- logging.info(
- "Printing differential analysis between {} and {}".format(
- baseline_str, comparison_str
- )
- )
-
- table_output = "# Comparison between {} and {}.\n\nTime Period from {}. (environment used: {})\n\n".format(
- baseline_str,
- comparison_str,
- from_human_str,
- baseline_deployment_name,
- )
- table_output += "\n By GROUP change csv:
\n\n"
- table_output += (
- "\ncommand_group,min_change,q1_change,median_change,q3_change,max_change \n"
- )
- for group_name, changes_list in group_change.items():
- min_change = min(changes_list)
- q1_change = np.percentile(changes_list, 25)
- median_change = np.median(changes_list)
- q3_change = np.percentile(changes_list, 75)
- max_change = max(changes_list)
- table_output += f"{group_name},{min_change:.3f},{q1_change:.3f},{median_change:.3f},{q3_change:.3f},{max_change:.3f}\n"
- table_output += "\n \n"
- table_output += "\n\n"
- table_output += "\n By COMMAND change csv:
\n\n"
+ # Generate table output
+ table_output = f"## Environment Comparison for Test: {test_name}\n\n"
+ table_output += f"**Metric:** {metric_name} ({metric_mode})\n\n"
table_output += (
- "\ncommand,min_change,q1_change,median_change,q3_change,max_change \n"
+ f"**Baseline:** {baseline_github_org}/{baseline_github_repo} {baseline_str}\n\n"
)
-
- # Filter commands by command group regex if specified
- filtered_command_change = command_change
- if command_group_regex != ".*":
- group_regex = re.compile(command_group_regex)
- filtered_command_change = {}
- for command_name, changes_list in command_change.items():
- command_group = categorize_command(command_name.lower())
- if re.search(group_regex, command_group):
- filtered_command_change[command_name] = changes_list
-
- for command_name, changes_list in filtered_command_change.items():
- min_change = min(changes_list)
- q1_change = np.percentile(changes_list, 25)
- median_change = np.median(changes_list)
- q3_change = np.percentile(changes_list, 75)
- max_change = max(changes_list)
- table_output += f"{command_name},{min_change:.3f},{q1_change:.3f},{median_change:.3f},{q3_change:.3f},{max_change:.3f}\n"
- table_output += "\n \n"
+ table_output += f"**Comparison:** {comparison_github_org}/{comparison_github_repo} {comparison_str}\n\n"
if total_unstable > 0:
old_stdout = sys.stdout
sys.stdout = mystdout = StringIO()
- table_output += "#### Unstable Table\n\n"
+ table_output += "#### Unstable Environments\n\n"
writer_regressions = MarkdownTableWriter(
table_name="",
headers=[
- "Test Case",
+ "Environment",
f"Baseline {baseline_github_org}/{baseline_github_repo} {baseline_str} (median obs. +- std.dev)",
f"Comparison {comparison_github_org}/{comparison_github_repo} {comparison_str} (median obs. +- std.dev)",
"% change ({})".format(metric_mode),
@@ -941,19 +961,19 @@ def compute_regression_table(
writer_regressions.dump(mystdout, False)
table_output += mystdout.getvalue()
table_output += "\n\n"
- test_names_str = "|".join([l[0] for l in unstable_list])
- table_output += f"Unstable test regexp names: {test_names_str}\n\n"
+ env_names_str = "|".join([l[0] for l in unstable_list])
+ table_output += f"Unstable environment names: {env_names_str}\n\n"
mystdout.close()
sys.stdout = old_stdout
if total_regressions > 0:
old_stdout = sys.stdout
sys.stdout = mystdout = StringIO()
- table_output += "#### Regressions Table\n\n"
+ table_output += "#### Regressions by Environment\n\n"
writer_regressions = MarkdownTableWriter(
table_name="",
headers=[
- "Test Case",
+ "Environment",
f"Baseline {baseline_github_org}/{baseline_github_repo} {baseline_str} (median obs. +- std.dev)",
f"Comparison {comparison_github_org}/{comparison_github_repo} {comparison_str} (median obs. +- std.dev)",
"% change ({})".format(metric_mode),
@@ -964,19 +984,19 @@ def compute_regression_table(
writer_regressions.dump(mystdout, False)
table_output += mystdout.getvalue()
table_output += "\n\n"
- test_names_str = "|".join([l[0] for l in regressions_list])
- table_output += f"Regressions test regexp names: {test_names_str}\n\n"
+ env_names_str = "|".join([l[0] for l in regressions_list])
+ table_output += f"Regression environment names: {env_names_str}\n\n"
mystdout.close()
sys.stdout = old_stdout
if total_improvements > 0:
old_stdout = sys.stdout
sys.stdout = mystdout = StringIO()
- table_output += "#### Improvements Table\n\n"
+ table_output += "#### Improvements by Environment\n\n"
writer_regressions = MarkdownTableWriter(
table_name="",
headers=[
- "Test Case",
+ "Environment",
f"Baseline {baseline_github_org}/{baseline_github_repo} {baseline_str} (median obs. +- std.dev)",
f"Comparison {comparison_github_org}/{comparison_github_repo} {comparison_str} (median obs. +- std.dev)",
"% change ({})".format(metric_mode),
@@ -987,8 +1007,8 @@ def compute_regression_table(
writer_regressions.dump(mystdout, False)
table_output += mystdout.getvalue()
table_output += "\n\n"
- test_names_str = "|".join([l[0] for l in improvements_list])
- table_output += f"Improvements test regexp names: {test_names_str}\n\n"
+ env_names_str = "|".join([l[0] for l in improvements_list])
+ table_output += f"Improvements environment names: {env_names_str}\n\n"
mystdout.close()
sys.stdout = old_stdout
@@ -997,7 +1017,7 @@ def compute_regression_table(
writer_full = MarkdownTableWriter(
table_name="",
headers=[
- "Test Case",
+ "Environment",
f"Baseline {baseline_github_org}/{baseline_github_repo} {baseline_str} (median obs. +- std.dev)",
f"Comparison {comparison_github_org}/{comparison_github_repo} {comparison_str} (median obs. +- std.dev)",
"% change ({})".format(metric_mode),
@@ -1005,36 +1025,37 @@ def compute_regression_table(
],
value_matrix=table_full,
)
- table_output += "\n Full Results table:
\n\n"
+ table_output += f"\n Full Environment Results for Test: {test_name}:
\n\n"
writer_full.dump(mystdout, False)
sys.stdout = old_stdout
table_output += mystdout.getvalue()
table_output += "\n \n"
+
len_baseline_only_list = len(baseline_only_list)
if len_baseline_only_list > 0:
- table_output += f"\n WARNING: There were {len_baseline_only_list} benchmarks with datapoints only on baseline.\n\n"
- baseline_only_test_names_str = "|".join([l for l in baseline_only_list])
+ table_output += f"\n WARNING: There were {len_baseline_only_list} environments with datapoints only on baseline.\n\n"
+ baseline_only_env_names_str = "|".join([l for l in baseline_only_list])
table_output += (
- f" Baseline only test regexp names: {baseline_only_test_names_str}\n\n"
+ f" Baseline only environment names: {baseline_only_env_names_str}\n\n"
)
+
len_comparison_only_list = len(comparison_only_list)
if len_comparison_only_list > 0:
- table_output += f"\n WARNING: There were {len_comparison_only_list} benchmarks with datapoints only on comparison.\n\n"
- comparison_only_test_names_str = "|".join([l for l in comparison_only_list])
+ table_output += f"\n WARNING: There were {len_comparison_only_list} environments with datapoints only on comparison.\n\n"
+ comparison_only_env_names_str = "|".join([l for l in comparison_only_list])
table_output += (
- f" Comparison only test regexp names: {comparison_only_test_names_str}\n\n"
+ f" Comparison only environment names: {comparison_only_env_names_str}\n\n"
)
- len_no_datapoints = len(no_datapoints_list)
- if len_no_datapoints > 0:
- table_output += f"\n WARNING: There were {len_no_datapoints} benchmarks with NO datapoints for both baseline and comparison.\n\n"
- table_output += "\n NO datapoints for both baseline and comparison:
\n\n"
- no_datapoints_test_names_str = "|".join([l for l in no_datapoints_list])
+
+ len_no_datapoints_list = len(no_datapoints_list)
+ if len_no_datapoints_list > 0:
+ table_output += f"\n WARNING: There were {len_no_datapoints_list} environments with no datapoints.\n\n"
+ no_datapoints_env_names_str = "|".join([l for l in no_datapoints_list])
table_output += (
- f" NO DATAPOINTS test regexp names: {no_datapoints_test_names_str}\n\n"
+ f" No datapoints environment names: {no_datapoints_env_names_str}\n\n"
)
- table_output += "\n \n"
return (
detected_regressions,
@@ -1049,141 +1070,866 @@ def compute_regression_table(
)
-def get_by_error(name, by_str_arr):
- by_string = ",".join(by_str_arr)
- return f"--{name}-branch, --{name}-tag, --{name}-target-branch, --{name}-hash, and --{name}-target-version are mutually exclusive. You selected a total of {len(by_str_arr)}: {by_string}. Pick one..."
-
-
-def get_by_strings(
- baseline_branch,
+def compute_regression_table(
+ rts,
+ tf_github_org,
+ tf_github_repo,
+ tf_triggering_env_baseline,
+ tf_triggering_env_comparison,
+ metric_name,
comparison_branch,
- baseline_tag,
- comparison_tag,
+ baseline_branch="unstable",
+ baseline_tag=None,
+ comparison_tag=None,
+ baseline_deployment_name="oss-standalone",
+ comparison_deployment_name="oss-standalone",
+ print_improvements_only=False,
+ print_regressions_only=False,
+ skip_unstable=False,
+ regressions_percent_lower_limit=5.0,
+ simplify_table=False,
+ test="",
+ testname_regex=".*",
+ verbose=False,
+ last_n_baseline=-1,
+ last_n_comparison=-1,
+ metric_mode="higher-better",
+ from_date=None,
+ from_ts_ms=None,
+ to_date=None,
+ to_ts_ms=None,
+ use_metric_context_path=None,
+ running_platform_baseline=None,
+ running_platform_comparison=None,
baseline_target_version=None,
comparison_target_version=None,
- baseline_hash=None,
comparison_hash=None,
+ baseline_hash=None,
+ baseline_github_repo="redis",
+ comparison_github_repo="redis",
baseline_target_branch=None,
comparison_target_branch=None,
+ baseline_github_org="redis",
+ comparison_github_org="redis",
+ regression_str="REGRESSION",
+ improvement_str="IMPROVEMENT",
+ tests_with_config={},
+ use_test_suites_folder=False,
+ test_suites_folder=None,
+ extra_filters="",
+ command_group_regex=".*",
+ command_regex=".*",
):
- baseline_covered = False
- comparison_covered = False
- by_str_baseline = ""
- by_str_comparison = ""
- baseline_str = ""
- comparison_str = ""
- baseline_by_arr = []
- comparison_by_arr = []
+ START_TIME_NOW_UTC, _, _ = get_start_time_vars()
+ START_TIME_LAST_MONTH_UTC = START_TIME_NOW_UTC - datetime.timedelta(days=31)
+ if from_date is None:
+ from_date = START_TIME_LAST_MONTH_UTC
+ if to_date is None:
+ to_date = START_TIME_NOW_UTC
+ if from_ts_ms is None:
+ from_ts_ms = int(from_date.timestamp() * 1000)
+ if to_ts_ms is None:
+ to_ts_ms = int(to_date.timestamp() * 1000)
+ from_human_str = humanize.naturaltime(
+ dt.datetime.utcfromtimestamp(from_ts_ms / 1000)
+ )
+ to_human_str = humanize.naturaltime(dt.datetime.utcfromtimestamp(to_ts_ms / 1000))
+ logging.info(
+ "Using a time-delta from {} to {}".format(from_human_str, to_human_str)
+ )
+ baseline_str, by_str_baseline, comparison_str, by_str_comparison = get_by_strings(
+ baseline_branch,
+ comparison_branch,
+ baseline_tag,
+ comparison_tag,
+ baseline_target_version,
+ comparison_target_version,
+ comparison_hash,
+ baseline_hash,
+ baseline_target_branch,
+ comparison_target_branch,
+ )
+ logging.info(f"Using baseline filter {by_str_baseline}={baseline_str}")
+ logging.info(f"Using comparison filter {by_str_comparison}={comparison_str}")
+ (
+ prefix,
+ testcases_setname,
+ _,
+ tsname_project_total_failures,
+ tsname_project_total_success,
+ _,
+ _,
+ _,
+ testcases_metric_context_path_setname,
+ _,
+ _,
+ _,
+ _,
+ _,
+ ) = get_overall_dashboard_keynames(
+ tf_github_org, tf_github_repo, tf_triggering_env_baseline
+ )
+ test_names = []
+ used_key = testcases_setname
+ test_filter = "test_name"
+ if use_metric_context_path:
+ test_filter = "test_name:metric_context_path"
+ used_key = testcases_metric_context_path_setname
+ tags_regex_string = re.compile(testname_regex)
+ if test != "":
+ test_names = test.split(",")
+ logging.info("Using test name {}".format(test_names))
+ elif use_test_suites_folder:
+ test_names = get_test_names_from_yaml_files(
+ test_suites_folder, tags_regex_string
+ )
+ else:
+ test_names = get_test_names_from_db(
+ rts, tags_regex_string, test_names, used_key
+ )
- ################# BASELINE BY ....
+ # Apply command regex filtering to tests_with_config
+ tests_with_config = filter_tests_by_command_regex(tests_with_config, command_regex)
- if baseline_branch is not None:
- by_str_baseline = "branch"
- baseline_covered = True
- baseline_str = baseline_branch
- baseline_by_arr.append(by_str_baseline)
+ (
+ detected_regressions,
+ table_full,
+ table_stable,
+ table_unstable,
+ table_improvements,
+ table_regressions,
+ total_improvements,
+ total_regressions,
+ total_stable,
+ total_unstable,
+ total_comparison_points,
+ regressions_list,
+ improvements_list,
+ unstable_list,
+ baseline_only_list,
+ comparison_only_list,
+ no_datapoints_list,
+ group_change,
+ command_change,
+ boxplot_data,
+ ) = from_rts_to_regression_table(
+ baseline_deployment_name,
+ comparison_deployment_name,
+ baseline_str,
+ comparison_str,
+ by_str_baseline,
+ by_str_comparison,
+ from_ts_ms,
+ to_ts_ms,
+ last_n_baseline,
+ last_n_comparison,
+ metric_mode,
+ metric_name,
+ print_improvements_only,
+ print_regressions_only,
+ skip_unstable,
+ regressions_percent_lower_limit,
+ rts,
+ simplify_table,
+ test_filter,
+ test_names,
+ tf_triggering_env_baseline,
+ tf_triggering_env_comparison,
+ verbose,
+ running_platform_baseline,
+ running_platform_comparison,
+ baseline_github_repo,
+ comparison_github_repo,
+ baseline_github_org,
+ comparison_github_org,
+ regression_str,
+ improvement_str,
+ tests_with_config,
+ extra_filters,
+ )
+ logging.info(
+ "Printing differential analysis between {} and {}".format(
+ baseline_str, comparison_str
+ )
+ )
- if baseline_tag is not None:
- by_str_baseline = "version"
- if baseline_covered:
- baseline_by_arr.append(by_str_baseline)
- logging.error(get_by_error("baseline", baseline_by_arr))
- exit(1)
- baseline_covered = True
- baseline_str = baseline_tag
+ table_output = "# Comparison between {} and {}.\n\nTime Period from {}. (environment used: {})\n\n".format(
+ baseline_str,
+ comparison_str,
+ from_human_str,
+ baseline_deployment_name,
+ )
+
+ table_output += "\n By GROUP change csv:
\n\n"
+ table_output += (
+ "\ncommand_group,min_change,q1_change,median_change,q3_change,max_change \n"
+ )
+ for group_name, changes_list in group_change.items():
+ min_change = min(changes_list)
+ q1_change = np.percentile(changes_list, 25)
+ median_change = np.median(changes_list)
+ q3_change = np.percentile(changes_list, 75)
+ max_change = max(changes_list)
+ table_output += f"{group_name},{min_change:.3f},{q1_change:.3f},{median_change:.3f},{q3_change:.3f},{max_change:.3f}\n"
+ table_output += "\n \n"
+ table_output += "\n\n"
+ table_output += "\n By COMMAND change csv:
\n\n"
+ table_output += (
+ "\ncommand,min_change,q1_change,median_change,q3_change,max_change \n"
+ )
+
+ # Filter commands by command group regex if specified
+ filtered_command_change = command_change
+ if command_group_regex != ".*":
+ group_regex = re.compile(command_group_regex)
+ filtered_command_change = {}
+ for command_name, changes_list in command_change.items():
+ command_group = categorize_command(command_name.lower())
+ if re.search(group_regex, command_group):
+ filtered_command_change[command_name] = changes_list
+
+ for command_name, changes_list in filtered_command_change.items():
+ min_change = min(changes_list)
+ q1_change = np.percentile(changes_list, 25)
+ median_change = np.median(changes_list)
+ q3_change = np.percentile(changes_list, 75)
+ max_change = max(changes_list)
+ table_output += f"{command_name},{min_change:.3f},{q1_change:.3f},{median_change:.3f},{q3_change:.3f},{max_change:.3f}\n"
+ table_output += "\n \n"
+
+ if total_unstable > 0:
+ old_stdout = sys.stdout
+ sys.stdout = mystdout = StringIO()
+ table_output += "#### Unstable Table\n\n"
+ writer_regressions = MarkdownTableWriter(
+ table_name="",
+ headers=[
+ "Test Case",
+ f"Baseline {baseline_github_org}/{baseline_github_repo} {baseline_str} (median obs. +- std.dev)",
+ f"Comparison {comparison_github_org}/{comparison_github_repo} {comparison_str} (median obs. +- std.dev)",
+ "% change ({})".format(metric_mode),
+ "Note",
+ ],
+ value_matrix=table_unstable,
+ )
+ writer_regressions.dump(mystdout, False)
+ table_output += mystdout.getvalue()
+ table_output += "\n\n"
+ test_names_str = "|".join([l[0] for l in unstable_list])
+ table_output += f"Unstable test regexp names: {test_names_str}\n\n"
+ mystdout.close()
+ sys.stdout = old_stdout
+
+ if total_regressions > 0:
+ old_stdout = sys.stdout
+ sys.stdout = mystdout = StringIO()
+ table_output += "#### Regressions Table\n\n"
+ writer_regressions = MarkdownTableWriter(
+ table_name="",
+ headers=[
+ "Test Case",
+ f"Baseline {baseline_github_org}/{baseline_github_repo} {baseline_str} (median obs. +- std.dev)",
+ f"Comparison {comparison_github_org}/{comparison_github_repo} {comparison_str} (median obs. +- std.dev)",
+ "% change ({})".format(metric_mode),
+ "Note",
+ ],
+ value_matrix=table_regressions,
+ )
+ writer_regressions.dump(mystdout, False)
+ table_output += mystdout.getvalue()
+ table_output += "\n\n"
+ test_names_str = "|".join([l[0] for l in regressions_list])
+ table_output += f"Regressions test regexp names: {test_names_str}\n\n"
+ mystdout.close()
+ sys.stdout = old_stdout
+
+ if total_improvements > 0:
+ old_stdout = sys.stdout
+ sys.stdout = mystdout = StringIO()
+ table_output += "#### Improvements Table\n\n"
+ writer_regressions = MarkdownTableWriter(
+ table_name="",
+ headers=[
+ "Test Case",
+ f"Baseline {baseline_github_org}/{baseline_github_repo} {baseline_str} (median obs. +- std.dev)",
+ f"Comparison {comparison_github_org}/{comparison_github_repo} {comparison_str} (median obs. +- std.dev)",
+ "% change ({})".format(metric_mode),
+ "Note",
+ ],
+ value_matrix=table_improvements,
+ )
+ writer_regressions.dump(mystdout, False)
+ table_output += mystdout.getvalue()
+ table_output += "\n\n"
+ test_names_str = "|".join([l[0] for l in improvements_list])
+ table_output += f"Improvements test regexp names: {test_names_str}\n\n"
+ mystdout.close()
+ sys.stdout = old_stdout
+
+ old_stdout = sys.stdout
+ sys.stdout = mystdout = StringIO()
+ writer_full = MarkdownTableWriter(
+ table_name="",
+ headers=[
+ "Test Case",
+ f"Baseline {baseline_github_org}/{baseline_github_repo} {baseline_str} (median obs. +- std.dev)",
+ f"Comparison {comparison_github_org}/{comparison_github_repo} {comparison_str} (median obs. +- std.dev)",
+ "% change ({})".format(metric_mode),
+ "Note",
+ ],
+ value_matrix=table_full,
+ )
+ table_output += "\n Full Results table:
\n\n"
+
+ writer_full.dump(mystdout, False)
+
+ sys.stdout = old_stdout
+ table_output += mystdout.getvalue()
+ table_output += "\n \n"
+ len_baseline_only_list = len(baseline_only_list)
+ if len_baseline_only_list > 0:
+ table_output += f"\n WARNING: There were {len_baseline_only_list} benchmarks with datapoints only on baseline.\n\n"
+ baseline_only_test_names_str = "|".join([l for l in baseline_only_list])
+ table_output += (
+ f" Baseline only test regexp names: {baseline_only_test_names_str}\n\n"
+ )
+ len_comparison_only_list = len(comparison_only_list)
+ if len_comparison_only_list > 0:
+ table_output += f"\n WARNING: There were {len_comparison_only_list} benchmarks with datapoints only on comparison.\n\n"
+ comparison_only_test_names_str = "|".join([l for l in comparison_only_list])
+ table_output += (
+ f" Comparison only test regexp names: {comparison_only_test_names_str}\n\n"
+ )
+ len_no_datapoints = len(no_datapoints_list)
+ if len_no_datapoints > 0:
+ table_output += f"\n WARNING: There were {len_no_datapoints} benchmarks with NO datapoints for both baseline and comparison.\n\n"
+ table_output += "\n NO datapoints for both baseline and comparison:
\n\n"
+ no_datapoints_test_names_str = "|".join([l for l in no_datapoints_list])
+ table_output += (
+ f" NO DATAPOINTS test regexp names: {no_datapoints_test_names_str}\n\n"
+ )
+ table_output += "\n \n"
+
+ return (
+ detected_regressions,
+ table_output,
+ improvements_list,
+ regressions_list,
+ total_stable,
+ total_unstable,
+ total_comparison_points,
+ boxplot_data,
+ command_change,
+ )
+
+
+def get_by_error(name, by_str_arr):
+ by_string = ",".join(by_str_arr)
+ return f"--{name}-branch, --{name}-tag, --{name}-target-branch, --{name}-hash, and --{name}-target-version are mutually exclusive. You selected a total of {len(by_str_arr)}: {by_string}. Pick one..."
+
+
+def get_by_strings(
+ baseline_branch,
+ comparison_branch,
+ baseline_tag,
+ comparison_tag,
+ baseline_target_version=None,
+ comparison_target_version=None,
+ baseline_hash=None,
+ comparison_hash=None,
+ baseline_target_branch=None,
+ comparison_target_branch=None,
+):
+ baseline_covered = False
+ comparison_covered = False
+ by_str_baseline = ""
+ by_str_comparison = ""
+ baseline_str = ""
+ comparison_str = ""
+ baseline_by_arr = []
+ comparison_by_arr = []
+
+ ################# BASELINE BY ....
+
+ if baseline_branch is not None:
+ by_str_baseline = "branch"
+ baseline_covered = True
+ baseline_str = baseline_branch
+ baseline_by_arr.append(by_str_baseline)
+
+ if baseline_tag is not None:
+ by_str_baseline = "version"
+ if baseline_covered:
+ baseline_by_arr.append(by_str_baseline)
+ logging.error(get_by_error("baseline", baseline_by_arr))
+ exit(1)
+ baseline_covered = True
+ baseline_str = baseline_tag
+
+ if baseline_target_version is not None:
+ by_str_baseline = "target+version"
+ if baseline_covered:
+ baseline_by_arr.append(by_str_baseline)
+ logging.error(get_by_error("baseline", baseline_by_arr))
+ exit(1)
+ baseline_covered = True
+ baseline_str = baseline_target_version
+
+ if baseline_hash is not None:
+ by_str_baseline = "hash"
+ if baseline_covered:
+ baseline_by_arr.append(by_str_baseline)
+ logging.error(get_by_error("baseline", baseline_by_arr))
+ exit(1)
+ baseline_covered = True
+ baseline_str = baseline_hash
+ if baseline_target_branch is not None:
+ by_str_baseline = "target+branch"
+ if baseline_covered:
+ baseline_by_arr.append(by_str_baseline)
+ logging.error(get_by_error("baseline", baseline_by_arr))
+ exit(1)
+ baseline_covered = True
+ baseline_str = baseline_target_branch
+
+ ################# COMPARISON BY ....
+
+ if comparison_branch is not None:
+ by_str_comparison = "branch"
+ comparison_covered = True
+ comparison_str = comparison_branch
+
+ if comparison_tag is not None:
+ # check if we had already covered comparison
+ if comparison_covered:
+ logging.error(
+ "--comparison-branch and --comparison-tag, --comparison-hash, --comparison-target-branch, and --comparison-target-table are mutually exclusive. Pick one..."
+ )
+ exit(1)
+ comparison_covered = True
+ by_str_comparison = "version"
+ comparison_str = comparison_tag
+ if comparison_target_version is not None:
+ # check if we had already covered comparison
+ if comparison_covered:
+ logging.error(
+ "--comparison-branch, --comparison-tag, --comparison-hash, --comparison-target-branch, and --comparison-target-table are mutually exclusive. Pick one..."
+ )
+ exit(1)
+ comparison_covered = True
+ by_str_comparison = "target+version"
+ comparison_str = comparison_target_version
+
+ if comparison_target_branch is not None:
+ # check if we had already covered comparison
+ if comparison_covered:
+ logging.error(
+ "--comparison-branch, --comparison-tag, --comparison-hash, --comparison-target-branch, and --comparison-target-table are mutually exclusive. Pick one..."
+ )
+ exit(1)
+ comparison_covered = True
+ by_str_comparison = "target+branch"
+ comparison_str = comparison_target_branch
+
+ if comparison_hash is not None:
+ # check if we had already covered comparison
+ # if comparison_covered:
+ # logging.error(
+ # "--comparison-branch, --comparison-tag, --comparison-hash, --comparison-target-branch, and --comparison-target-table are mutually exclusive. Pick one..."
+ # )
+ # exit(1)
+ comparison_covered = True
+ by_str_comparison = "hash"
+ comparison_str = comparison_hash
+
+ if baseline_covered is False:
+ logging.error(
+ "You need to provider either "
+ + "( --baseline-branch, --baseline-tag, --baseline-hash, --baseline-target-branch or --baseline-target-version ) "
+ )
+ exit(1)
+ if comparison_covered is False:
+ logging.error(
+ "You need to provider either "
+ + "( --comparison-branch, --comparison-tag, --comparison-hash, --comparison-target-branch or --comparison-target-version ) "
+ )
+ exit(1)
+ return baseline_str, by_str_baseline, comparison_str, by_str_comparison
+
+
+def process_single_test_comparison(
+ test_name,
+ tests_with_config,
+ original_metric_mode,
+ baseline_str,
+ comparison_str,
+ by_str_baseline,
+ by_str_comparison,
+ metric_name,
+ test_filter,
+ baseline_github_repo,
+ comparison_github_repo,
+ tf_triggering_env_baseline,
+ tf_triggering_env_comparison,
+ extra_filters,
+ baseline_deployment_name,
+ comparison_deployment_name,
+ baseline_github_org,
+ comparison_github_org,
+ running_platform_baseline,
+ running_platform_comparison,
+ rts,
+ from_ts_ms,
+ to_ts_ms,
+ last_n_baseline,
+ last_n_comparison,
+ verbose,
+ regressions_percent_lower_limit,
+ simplify_table,
+ regression_str,
+ improvement_str,
+ progress,
+):
+ """
+ Process comparison analysis for a single test.
+
+ Returns a dictionary containing all the results and side effects that need to be
+ accumulated by the caller.
+ """
+ tested_groups = []
+ tested_commands = []
+ if test_name in tests_with_config:
+ test_spec = tests_with_config[test_name]
+ if "tested-groups" in test_spec:
+ tested_groups = test_spec["tested-groups"]
+ if "tested-commands" in test_spec:
+ tested_commands = test_spec["tested-commands"]
+ else:
+ logging.error(f"Test does not contain spec info: {test_name}")
+
+ metric_mode = original_metric_mode
+ compare_version = "main"
+ # GE
+ github_link = "https://github.com/redis/redis-benchmarks-specification/blob"
+ test_path = f"redis_benchmarks_specification/test-suites/{test_name}.yml"
+ test_link = f"[{test_name}]({github_link}/{compare_version}/{test_path})"
+ multi_value_baseline = check_multi_value_filter(baseline_str)
+ multi_value_comparison = check_multi_value_filter(comparison_str)
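+ # When the baseline/comparison filter is multi-valued, more than one matching time-series is expected and accepted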
+
+ filters_baseline = [
+ "metric={}".format(metric_name),
+ "{}={}".format(test_filter, test_name),
+ "github_repo={}".format(baseline_github_repo),
+ "triggering_env={}".format(tf_triggering_env_baseline),
+ ]
+ if extra_filters != "":
+ filters_baseline.append(extra_filters)
+ if baseline_str != "":
+ filters_baseline.append("{}={}".format(by_str_baseline, baseline_str))
+ if baseline_deployment_name != "":
+ filters_baseline.append("deployment_name={}".format(baseline_deployment_name))
+ if baseline_github_org != "":
+ filters_baseline.append(f"github_org={baseline_github_org}")
+ if running_platform_baseline is not None and running_platform_baseline != "":
+ filters_baseline.append("running_platform={}".format(running_platform_baseline))
+ filters_comparison = [
+ "metric={}".format(metric_name),
+ "{}={}".format(test_filter, test_name),
+ "github_repo={}".format(comparison_github_repo),
+ "triggering_env={}".format(tf_triggering_env_comparison),
+ ]
+ if comparison_str != "":
+ filters_comparison.append("{}={}".format(by_str_comparison, comparison_str))
+ if comparison_deployment_name != "":
+ filters_comparison.append(
+ "deployment_name={}".format(comparison_deployment_name)
+ )
+ if extra_filters != "":
+ filters_comparison.append(extra_filters)
+ if comparison_github_org != "":
+ filters_comparison.append(f"github_org={comparison_github_org}")
+ if "hash" not in by_str_baseline:
+ filters_baseline.append("hash==")
+ if "hash" not in by_str_comparison:
+ filters_comparison.append("hash==")
+ if running_platform_comparison is not None and running_platform_comparison != "":
+ filters_comparison.append(
+ "running_platform={}".format(running_platform_comparison)
+ )
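+ # Query the time-series index for series whose labels match the baseline and comparison filters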
+ baseline_timeseries = rts.ts().queryindex(filters_baseline)
+ comparison_timeseries = rts.ts().queryindex(filters_comparison)
+
+ # avoiding target time-series
+ comparison_timeseries = [x for x in comparison_timeseries if "target" not in x]
+ baseline_timeseries = [x for x in baseline_timeseries if "target" not in x]
+ progress.update()
+ if verbose:
+ logging.info(
+ "Baseline timeseries for {}: {}. test={}".format(
+ baseline_str, len(baseline_timeseries), test_name
+ )
+ )
+ logging.info(
+ "Comparison timeseries for {}: {}. test={}".format(
+ comparison_str, len(comparison_timeseries), test_name
+ )
+ )
+ if len(baseline_timeseries) > 1 and multi_value_baseline is False:
+ baseline_timeseries = get_only_Totals(baseline_timeseries)
+
+ # Initialize result dictionary
+ result = {
+ "skip_test": False,
+ "no_datapoints_baseline": False,
+ "no_datapoints_comparison": False,
+ "no_datapoints_both": False,
+ "baseline_only": False,
+ "comparison_only": False,
+ "detected_regression": False,
+ "detected_improvement": False,
+ "unstable": False,
+ "should_add_line": False,
+ "line": None,
+ "percentage_change": 0.0,
+ "tested_groups": tested_groups,
+ "tested_commands": tested_commands,
+ "boxplot_data": None,
+ }
+
+ if len(baseline_timeseries) == 0:
+ logging.warning(
+ f"No datapoints for test={test_name} for baseline timeseries {baseline_timeseries}"
+ )
+ result["no_datapoints_baseline"] = True
+ result["no_datapoints_both"] = True
+
+ if len(comparison_timeseries) == 0:
+ logging.warning(
+ f"No datapoints for test={test_name} for comparison timeseries {comparison_timeseries}"
+ )
+ result["no_datapoints_comparison"] = True
+ result["no_datapoints_both"] = True
+
+ if len(baseline_timeseries) != 1 and multi_value_baseline is False:
+ if verbose:
+ logging.warning(
+ "Skipping this test given the value of timeseries !=1. Baseline timeseries {}".format(
+ len(baseline_timeseries)
+ )
+ )
+ if len(baseline_timeseries) > 1:
+ logging.warning(
+ "\t\tTime-series: {}".format(", ".join(baseline_timeseries))
+ )
+ result["skip_test"] = True
+ return result
+
+ if len(comparison_timeseries) > 1 and multi_value_comparison is False:
+ comparison_timeseries = get_only_Totals(comparison_timeseries)
+ if len(comparison_timeseries) != 1 and multi_value_comparison is False:
+ if verbose:
+ logging.warning(
+ "Comparison timeseries {}".format(len(comparison_timeseries))
+ )
+ result["skip_test"] = True
+ return result
+
+ baseline_v = "N/A"
+ comparison_v = "N/A"
+ baseline_values = []
+ baseline_datapoints = []
+ comparison_values = []
+ comparison_datapoints = []
+ percentage_change = 0.0
+ baseline_v_str = "N/A"
+ comparison_v_str = "N/A"
+ largest_variance = 0
+ baseline_pct_change = "N/A"
+ comparison_pct_change = "N/A"
+
+ note = ""
+ try:
+ for ts_name_baseline in baseline_timeseries:
+ datapoints_inner = rts.ts().revrange(ts_name_baseline, from_ts_ms, to_ts_ms)
+ baseline_datapoints.extend(datapoints_inner)
+ (
+ baseline_pct_change,
+ baseline_v,
+ largest_variance,
+ ) = get_v_pct_change_and_largest_var(
+ baseline_datapoints,
+ baseline_pct_change,
+ baseline_v,
+ baseline_values,
+ largest_variance,
+ last_n_baseline,
+ verbose,
+ )
+ for ts_name_comparison in comparison_timeseries:
+ datapoints_inner = rts.ts().revrange(
+ ts_name_comparison, from_ts_ms, to_ts_ms
+ )
+ comparison_datapoints.extend(datapoints_inner)
+
+ (
+ comparison_pct_change,
+ comparison_v,
+ largest_variance,
+ ) = get_v_pct_change_and_largest_var(
+ comparison_datapoints,
+ comparison_pct_change,
+ comparison_v,
+ comparison_values,
+ largest_variance,
+ last_n_comparison,
+ verbose,
+ )
+
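+ # The waterline is the percent-change threshold above which a change is flagged as a regression or improvement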
+ waterline = regressions_percent_lower_limit
+ # if regressions_percent_lower_limit < largest_variance:
+ # note = "waterline={:.1f}%.".format(largest_variance)
+ # waterline = largest_variance
+
+ except redis.exceptions.ResponseError as e:
+ logging.error(
+ "Detected a redis.exceptions.ResponseError. {}".format(e.__str__())
+ )
+ pass
+ except ZeroDivisionError as e:
+ logging.error("Detected a ZeroDivisionError. {}".format(e.__str__()))
+ pass
+
+ unstable = False
+
+ if baseline_v != "N/A" and comparison_v == "N/A":
+ logging.warning(
+ f"Baseline contains datapoints but comparison not for test: {test_name}"
+ )
+ result["baseline_only"] = True
+ if comparison_v != "N/A" and baseline_v == "N/A":
+ logging.warning(
+ f"Comparison contains datapoints but baseline not for test: {test_name}"
+ )
+ result["comparison_only"] = True
+ if (
+ baseline_v != "N/A"
+ and comparison_pct_change != "N/A"
+ and comparison_v != "N/A"
+ and baseline_pct_change != "N/A"
+ ):
+ if comparison_pct_change > 10.0 or baseline_pct_change > 10.0:
+ note = "UNSTABLE (very high variance)"
+ unstable = True
+ result["unstable"] = True
+
+ baseline_v_str = prepare_value_str(
+ baseline_pct_change,
+ baseline_v,
+ baseline_values,
+ simplify_table,
+ metric_name,
+ )
+ comparison_v_str = prepare_value_str(
+ comparison_pct_change,
+ comparison_v,
+ comparison_values,
+ simplify_table,
+ metric_name,
+ )
- if baseline_target_version is not None:
- by_str_baseline = "target+version"
- if baseline_covered:
- baseline_by_arr.append(by_str_baseline)
- logging.error(get_by_error("baseline", baseline_by_arr))
- exit(1)
- baseline_covered = True
- baseline_str = baseline_target_version
+ if metric_mode == "higher-better":
+ percentage_change = (float(comparison_v) / float(baseline_v) - 1) * 100.0
+ else:
+ # lower-better
+ percentage_change = (
+ -(float(baseline_v) - float(comparison_v)) / float(baseline_v)
+ ) * 100.0
- if baseline_hash is not None:
- by_str_baseline = "hash"
- if baseline_covered:
- baseline_by_arr.append(by_str_baseline)
- logging.error(get_by_error("baseline", baseline_by_arr))
- exit(1)
- baseline_covered = True
- baseline_str = baseline_hash
- if baseline_target_branch is not None:
- by_str_baseline = "target+branch"
- if baseline_covered:
- baseline_by_arr.append(by_str_baseline)
- logging.error(get_by_error("baseline", baseline_by_arr))
- exit(1)
- baseline_covered = True
- baseline_str = baseline_target_branch
+ # Collect data for box plot
+ result["boxplot_data"] = (test_name, percentage_change)
+ else:
+ logging.warning(
+ f"Missing data for test {test_name}. baseline_v={baseline_v} (pct_change={baseline_pct_change}), comparison_v={comparison_v} (pct_change={comparison_pct_change}) "
+ )
- ################# COMPARISON BY ....
+ result["percentage_change"] = percentage_change
- if comparison_branch is not None:
- by_str_comparison = "branch"
- comparison_covered = True
- comparison_str = comparison_branch
+ if baseline_v != "N/A" or comparison_v != "N/A":
+ detected_regression = False
+ detected_improvement = False
+ noise_waterline = 3
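+ # Changes beyond the noise band but below the waterline are only noted as "potential" regressions/improvements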
- if comparison_tag is not None:
- # check if we had already covered comparison
- if comparison_covered:
- logging.error(
- "--comparison-branch and --comparison-tag, --comparison-hash, --comparison-target-branch, and --comparison-target-table are mutually exclusive. Pick one..."
- )
- exit(1)
- comparison_covered = True
- by_str_comparison = "version"
- comparison_str = comparison_tag
- if comparison_target_version is not None:
- # check if we had already covered comparison
- if comparison_covered:
- logging.error(
- "--comparison-branch, --comparison-tag, --comparison-hash, --comparison-target-branch, and --comparison-target-table are mutually exclusive. Pick one..."
- )
- exit(1)
- comparison_covered = True
- by_str_comparison = "target+version"
- comparison_str = comparison_target_version
+ # For higher-better metrics: negative change = regression, positive change = improvement
+ # For lower-better metrics: positive change = regression, negative change = improvement
+ if metric_mode == "higher-better":
+ # Higher is better: negative change is bad (regression), positive change is good (improvement)
+ if percentage_change < 0.0:
+ if -waterline >= percentage_change:
+ detected_regression = True
+ note = note + f" {regression_str}"
+ elif percentage_change < -noise_waterline:
+ if simplify_table is False:
+ note = note + f" potential {regression_str}"
+ else:
+ if simplify_table is False:
+ note = note + " No Change"
- if comparison_target_branch is not None:
- # check if we had already covered comparison
- if comparison_covered:
- logging.error(
- "--comparison-branch, --comparison-tag, --comparison-hash, --comparison-target-branch, and --comparison-target-table are mutually exclusive. Pick one..."
- )
- exit(1)
- comparison_covered = True
- by_str_comparison = "target+branch"
- comparison_str = comparison_target_branch
+ if percentage_change > 0.0:
+ if percentage_change > waterline:
+ detected_improvement = True
+ note = note + f" {improvement_str}"
+ elif percentage_change > noise_waterline:
+ if simplify_table is False:
+ note = note + f" potential {improvement_str}"
+ else:
+ if simplify_table is False:
+ note = note + " No Change"
+ else:
+ # Lower is better: positive change is bad (regression), negative change is good (improvement)
+ if percentage_change > 0.0:
+ if percentage_change >= waterline:
+ detected_regression = True
+ note = note + f" {regression_str}"
+ elif percentage_change > noise_waterline:
+ if simplify_table is False:
+ note = note + f" potential {regression_str}"
+ else:
+ if simplify_table is False:
+ note = note + " No Change"
- if comparison_hash is not None:
- # check if we had already covered comparison
- # if comparison_covered:
- # logging.error(
- # "--comparison-branch, --comparison-tag, --comparison-hash, --comparison-target-branch, and --comparison-target-table are mutually exclusive. Pick one..."
- # )
- # exit(1)
- comparison_covered = True
- by_str_comparison = "hash"
- comparison_str = comparison_hash
+ if percentage_change < 0.0:
+ if -percentage_change > waterline:
+ detected_improvement = True
+ note = note + f" {improvement_str}"
+ elif -percentage_change > noise_waterline:
+ if simplify_table is False:
+ note = note + f" potential {improvement_str}"
+ else:
+ if simplify_table is False:
+ note = note + " No Change"
- if baseline_covered is False:
- logging.error(
- "You need to provider either "
- + "( --baseline-branch, --baseline-tag, --baseline-hash, --baseline-target-branch or --baseline-target-version ) "
+ result["detected_regression"] = detected_regression
+ result["detected_improvement"] = detected_improvement
+
+ line = get_line(
+ baseline_v_str,
+ comparison_v_str,
+ note,
+ percentage_change,
+ test_link,
)
- exit(1)
- if comparison_covered is False:
- logging.error(
- "You need to provider either "
- + "( --comparison-branch, --comparison-tag, --comparison-hash, --comparison-target-branch or --comparison-target-version ) "
+ result["line"] = line
+ else:
+ logging.warning(
+ "There were no datapoints both for baseline and comparison for test: {test_name}"
)
- exit(1)
- return baseline_str, by_str_baseline, comparison_str, by_str_comparison
+ result["no_datapoints_both"] = True
+ return result
-def process_single_test_comparison(
+
+def process_single_env_comparison(
+ env_name,
test_name,
tests_with_config,
original_metric_mode,
@@ -1198,8 +1944,6 @@ def process_single_test_comparison(
tf_triggering_env_baseline,
tf_triggering_env_comparison,
extra_filters,
- baseline_deployment_name,
- comparison_deployment_name,
baseline_github_org,
comparison_github_org,
running_platform_baseline,
@@ -1217,21 +1961,31 @@ def process_single_test_comparison(
progress,
):
"""
- Process comparison analysis for a single test.
-
- Returns a dictionary containing all the results and side effects that need to be
- accumulated by the caller.
+ Process comparison analysis for a single environment for a specific test.
+ This is similar to process_single_test_comparison but focuses on environment comparison.
"""
- tested_groups = []
- tested_commands = []
- if test_name in tests_with_config:
- test_spec = tests_with_config[test_name]
- if "tested-groups" in test_spec:
- tested_groups = test_spec["tested-groups"]
- if "tested-commands" in test_spec:
- tested_commands = test_spec["tested-commands"]
- else:
- logging.error(f"Test does not contain spec info: {test_name}")
+ result = {
+ "baseline_only": False,
+ "comparison_only": False,
+ "unstable": False,
+ "detected_regression": False,
+ "detected_improvement": False,
+ "no_datapoints_both": False,
+ "percentage_change": "N/A",
+ "line": [],
+ "boxplot_data": None,
+ }
+
+ baseline_datapoints = []
+ comparison_datapoints = []
+ baseline_values = []
+ comparison_values = []
+ baseline_v = "N/A"
+ comparison_v = "N/A"
+ baseline_pct_change = "N/A"
+ comparison_pct_change = "N/A"
+ percentage_change = "N/A"
+ largest_variance = 0.0
metric_mode = original_metric_mode
compare_version = "main"
@@ -1242,136 +1996,68 @@ def process_single_test_comparison(
multi_value_baseline = check_multi_value_filter(baseline_str)
multi_value_comparison = check_multi_value_filter(comparison_str)
+ # Build filters for baseline environment
filters_baseline = [
"metric={}".format(metric_name),
"{}={}".format(test_filter, test_name),
"github_repo={}".format(baseline_github_repo),
"triggering_env={}".format(tf_triggering_env_baseline),
+ "deployment_name={}".format(env_name), # Use env_name as deployment_name
]
if extra_filters != "":
filters_baseline.append(extra_filters)
if baseline_str != "":
filters_baseline.append("{}={}".format(by_str_baseline, baseline_str))
- if baseline_deployment_name != "":
- filters_baseline.append("deployment_name={}".format(baseline_deployment_name))
if baseline_github_org != "":
filters_baseline.append(f"github_org={baseline_github_org}")
if running_platform_baseline is not None and running_platform_baseline != "":
filters_baseline.append("running_platform={}".format(running_platform_baseline))
+
+ # Build filters for comparison environment (same environment, different branch/version)
filters_comparison = [
"metric={}".format(metric_name),
"{}={}".format(test_filter, test_name),
"github_repo={}".format(comparison_github_repo),
"triggering_env={}".format(tf_triggering_env_comparison),
+ "deployment_name={}".format(env_name), # Use same env_name
]
if comparison_str != "":
filters_comparison.append("{}={}".format(by_str_comparison, comparison_str))
- if comparison_deployment_name != "":
- filters_comparison.append(
- "deployment_name={}".format(comparison_deployment_name)
- )
if extra_filters != "":
filters_comparison.append(extra_filters)
if comparison_github_org != "":
- filters_comparison.append(f"github_org={comparison_github_org}")
- if "hash" not in by_str_baseline:
- filters_baseline.append("hash==")
- if "hash" not in by_str_comparison:
- filters_comparison.append("hash==")
- if running_platform_comparison is not None and running_platform_comparison != "":
- filters_comparison.append(
- "running_platform={}".format(running_platform_comparison)
- )
- baseline_timeseries = rts.ts().queryindex(filters_baseline)
- comparison_timeseries = rts.ts().queryindex(filters_comparison)
-
- # avoiding target time-series
- comparison_timeseries = [x for x in comparison_timeseries if "target" not in x]
- baseline_timeseries = [x for x in baseline_timeseries if "target" not in x]
- progress.update()
- if verbose:
- logging.info(
- "Baseline timeseries for {}: {}. test={}".format(
- baseline_str, len(baseline_timeseries), test_name
- )
- )
- logging.info(
- "Comparison timeseries for {}: {}. test={}".format(
- comparison_str, len(comparison_timeseries), test_name
- )
- )
- if len(baseline_timeseries) > 1 and multi_value_baseline is False:
- baseline_timeseries = get_only_Totals(baseline_timeseries)
-
- # Initialize result dictionary
- result = {
- "skip_test": False,
- "no_datapoints_baseline": False,
- "no_datapoints_comparison": False,
- "no_datapoints_both": False,
- "baseline_only": False,
- "comparison_only": False,
- "detected_regression": False,
- "detected_improvement": False,
- "unstable": False,
- "should_add_line": False,
- "line": None,
- "percentage_change": 0.0,
- "tested_groups": tested_groups,
- "tested_commands": tested_commands,
- "boxplot_data": None,
- }
-
- if len(baseline_timeseries) == 0:
- logging.warning(
- f"No datapoints for test={test_name} for baseline timeseries {baseline_timeseries}"
+ filters_comparison.append(f"github_org={comparison_github_org}")
+ if "hash" not in by_str_baseline:
+ filters_baseline.append("hash==")
+ if "hash" not in by_str_comparison:
+ filters_comparison.append("hash==")
+ if running_platform_comparison is not None and running_platform_comparison != "":
+ filters_comparison.append(
+ "running_platform={}".format(running_platform_comparison)
)
- result["no_datapoints_baseline"] = True
- result["no_datapoints_both"] = True
- if len(comparison_timeseries) == 0:
- logging.warning(
- f"No datapoints for test={test_name} for comparison timeseries {comparison_timeseries}"
- )
- result["no_datapoints_comparison"] = True
- result["no_datapoints_both"] = True
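+ # Query the time-series index for this environment on both the baseline and comparison side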
+ baseline_timeseries = rts.ts().queryindex(filters_baseline)
+ comparison_timeseries = rts.ts().queryindex(filters_comparison)
- if len(baseline_timeseries) != 1 and multi_value_baseline is False:
- if verbose:
- logging.warning(
- "Skipping this test given the value of timeseries !=1. Baseline timeseries {}".format(
- len(baseline_timeseries)
- )
+ # avoiding target time-series
+ comparison_timeseries = [x for x in comparison_timeseries if "target" not in x]
+ baseline_timeseries = [x for x in baseline_timeseries if "target" not in x]
+ progress.update()
+ if verbose:
+ logging.info(
+ "Baseline timeseries for {} (env: {}): {}. test={}".format(
+ baseline_str, env_name, len(baseline_timeseries), test_name
)
- if len(baseline_timeseries) > 1:
- logging.warning(
- "\t\tTime-series: {}".format(", ".join(baseline_timeseries))
- )
- result["skip_test"] = True
- return result
-
+ )
+ logging.info(
+ "Comparison timeseries for {} (env: {}): {}. test={}".format(
+ comparison_str, env_name, len(comparison_timeseries), test_name
+ )
+ )
+ if len(baseline_timeseries) > 1 and multi_value_baseline is False:
+ baseline_timeseries = get_only_Totals(baseline_timeseries)
if len(comparison_timeseries) > 1 and multi_value_comparison is False:
comparison_timeseries = get_only_Totals(comparison_timeseries)
- if len(comparison_timeseries) != 1 and multi_value_comparison is False:
- if verbose:
- logging.warning(
- "Comparison timeseries {}".format(len(comparison_timeseries))
- )
- result["skip_test"] = True
- return result
-
- baseline_v = "N/A"
- comparison_v = "N/A"
- baseline_values = []
- baseline_datapoints = []
- comparison_values = []
- comparison_datapoints = []
- percentage_change = 0.0
- baseline_v_str = "N/A"
- comparison_v_str = "N/A"
- largest_variance = 0
- baseline_pct_change = "N/A"
- comparison_pct_change = "N/A"
note = ""
try:
@@ -1412,9 +2098,6 @@ def process_single_test_comparison(
)
waterline = regressions_percent_lower_limit
- # if regressions_percent_lower_limit < largest_variance:
- # note = "waterline={:.1f}%.".format(largest_variance)
- # waterline = largest_variance
except redis.exceptions.ResponseError as e:
logging.error(
@@ -1429,12 +2112,12 @@ def process_single_test_comparison(
if baseline_v != "N/A" and comparison_v == "N/A":
logging.warning(
- f"Baseline contains datapoints but comparison not for test: {test_name}"
+ f"Baseline contains datapoints but comparison not for env: {env_name}, test: {test_name}"
)
result["baseline_only"] = True
if comparison_v != "N/A" and baseline_v == "N/A":
logging.warning(
- f"Comparison contains datapoints but baseline not for test: {test_name}"
+ f"Comparison contains datapoints but baseline not for env: {env_name}, test: {test_name}"
)
result["comparison_only"] = True
if (
@@ -1472,67 +2155,45 @@ def process_single_test_comparison(
) * 100.0
# Collect data for box plot
- result["boxplot_data"] = (test_name, percentage_change)
+ result["boxplot_data"] = (env_name, percentage_change)
else:
logging.warn(
- f"Missing data for test {test_name}. baseline_v={baseline_v} (pct_change={baseline_pct_change}), comparison_v={comparison_v} (pct_change={comparison_pct_change}) "
+ f"Missing data for env {env_name}, test {test_name}. baseline_v={baseline_v} (pct_change={baseline_pct_change}), comparison_v={comparison_v} (pct_change={comparison_pct_change}) "
)
result["percentage_change"] = percentage_change
- if baseline_v != "N/A" or comparison_v != "N/A":
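+ # Classify the change only when both sides produced a value and the comparison is stable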
+ if (
+ baseline_v != "N/A"
+ and comparison_v != "N/A"
+ and percentage_change != "N/A"
+ and unstable is False
+ ):
detected_regression = False
detected_improvement = False
- noise_waterline = 3
-
- # For higher-better metrics: negative change = regression, positive change = improvement
- # For lower-better metrics: positive change = regression, negative change = improvement
- if metric_mode == "higher-better":
- # Higher is better: negative change is bad (regression), positive change is good (improvement)
- if percentage_change < 0.0:
- if -waterline >= percentage_change:
- detected_regression = True
- note = note + f" {regression_str}"
- elif percentage_change < -noise_waterline:
- if simplify_table is False:
- note = note + f" potential {regression_str}"
- else:
- if simplify_table is False:
- note = note + " No Change"
-
- if percentage_change > 0.0:
- if percentage_change > waterline:
- detected_improvement = True
- note = note + f" {improvement_str}"
- elif percentage_change > noise_waterline:
- if simplify_table is False:
- note = note + f" potential {improvement_str}"
- else:
- if simplify_table is False:
- note = note + " No Change"
- else:
- # Lower is better: positive change is bad (regression), negative change is good (improvement)
- if percentage_change > 0.0:
- if percentage_change >= waterline:
- detected_regression = True
- note = note + f" {regression_str}"
- elif percentage_change > noise_waterline:
- if simplify_table is False:
- note = note + f" potential {regression_str}"
- else:
- if simplify_table is False:
- note = note + " No Change"
-
- if percentage_change < 0.0:
- if -percentage_change > waterline:
- detected_improvement = True
- note = note + f" {improvement_str}"
- elif -percentage_change > noise_waterline:
- if simplify_table is False:
- note = note + f" potential {improvement_str}"
- else:
- if simplify_table is False:
- note = note + " No Change"
+ noise_waterline = 1.0
+
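+ # Classification in this block: a positive percentage change counts toward a
+ # regression and a negative one toward an improvement. Changes whose magnitude
+ # is below the 1% noise waterline are reported as "No Change"; between the
+ # noise waterline and the configured waterline they are only flagged as "potential".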
+ if percentage_change > 0.0:
+ if percentage_change > waterline:
+ detected_regression = True
+ note = note + f" {regression_str}"
+ elif percentage_change > noise_waterline:
+ if simplify_table is False:
+ note = note + f" potential {regression_str}"
+ else:
+ if simplify_table is False:
+ note = note + " No Change"
+
+ if percentage_change < 0.0:
+ if -percentage_change > waterline:
+ detected_improvement = True
+ note = note + f" {improvement_str}"
+ elif -percentage_change > noise_waterline:
+ if simplify_table is False:
+ note = note + f" potential {improvement_str}"
+ else:
+ if simplify_table is False:
+ note = note + " No Change"
result["detected_regression"] = detected_regression
result["detected_improvement"] = detected_improvement
@@ -1542,18 +2203,201 @@ def process_single_test_comparison(
comparison_v_str,
note,
percentage_change,
- test_link,
+ env_name, # Use env_name instead of test_link for environment comparison
)
result["line"] = line
else:
logging.warning(
- "There were no datapoints both for baseline and comparison for test: {test_name}"
+ f"There were no datapoints both for baseline and comparison for env: {env_name}, test: {test_name}"
)
result["no_datapoints_both"] = True
return result
+def from_rts_to_env_comparison_table(
+ test_name,
+ env_list,
+ baseline_str,
+ comparison_str,
+ by_str_baseline,
+ by_str_comparison,
+ from_ts_ms,
+ to_ts_ms,
+ last_n_baseline,
+ last_n_comparison,
+ metric_mode,
+ metric_name,
+ print_improvements_only,
+ print_regressions_only,
+ skip_unstable,
+ regressions_percent_lower_limit,
+ rts,
+ simplify_table,
+ test_filter,
+ tf_triggering_env_baseline,
+ tf_triggering_env_comparison,
+ verbose,
+ running_platform_baseline=None,
+ running_platform_comparison=None,
+ baseline_github_repo="redis",
+ comparison_github_repo="redis",
+ baseline_github_org="redis",
+ comparison_github_org="redis",
+ regression_str="REGRESSION",
+ improvement_str="IMPROVEMENT",
+ tests_with_config={},
+ extra_filters="",
+):
+ """
+ Compare a single test across multiple environments, producing one result row
+ per environment instead of one row per test.
+ Returns a detected-regressions flag, the comparison tables (full, stable,
+ unstable, improvements, regressions), their totals, the per-category environment lists, and the box-plot data.
+ """
+ original_metric_mode = metric_mode
+
+ # Initialize result containers
+ table_full = []
+ table_stable = []
+ table_unstable = []
+ table_improvements = []
+ table_regressions = []
+ regressions_list = []
+ improvements_list = []
+ unstable_list = []
+ baseline_only_list = []
+ comparison_only_list = []
+ no_datapoints_list = []
+ boxplot_data = []
+
+ total_improvements = 0
+ total_regressions = 0
+ total_stable = 0
+ total_unstable = 0
+ total_comparison_points = 0
+ detected_regressions = False
+
+ # Progress tracking
+ from tqdm import tqdm
+
+ progress = tqdm(total=len(env_list), desc=f"Processing envs for {test_name[:30]}")
+
+ def process_env_wrapper(env_name):
+ """Process a single environment and return (env_name, result) so each result stays tied to its environment."""
+ result = process_single_env_comparison(
+ env_name,
+ test_name,
+ tests_with_config,
+ original_metric_mode,
+ baseline_str,
+ comparison_str,
+ by_str_baseline,
+ by_str_comparison,
+ metric_name,
+ test_filter,
+ baseline_github_repo,
+ comparison_github_repo,
+ tf_triggering_env_baseline,
+ tf_triggering_env_comparison,
+ extra_filters,
+ baseline_github_org,
+ comparison_github_org,
+ running_platform_baseline,
+ running_platform_comparison,
+ rts,
+ from_ts_ms,
+ to_ts_ms,
+ last_n_baseline,
+ last_n_comparison,
+ verbose,
+ regressions_percent_lower_limit,
+ simplify_table,
+ regression_str,
+ improvement_str,
+ progress,
+ )
+ return env_name, result
+
+ # Process all environments
+ for env_name in env_list:
+ env_name, result = process_env_wrapper(env_name)
+
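+ # Route each environment into a bucket: missing data, baseline/comparison only,
+ # unstable, regression, improvement, or stable.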
+ if result["no_datapoints_both"]:
+ no_datapoints_list.append(env_name)
+ continue
+
+ if result["baseline_only"]:
+ baseline_only_list.append(env_name)
+ continue
+
+ if result["comparison_only"]:
+ comparison_only_list.append(env_name)
+ continue
+
+ if result["unstable"]:
+ unstable_list.append((env_name, result["line"]))
+ table_unstable.append(result["line"])
+ total_unstable += 1
+ if skip_unstable is False:
+ table_full.append(result["line"])
+ total_comparison_points += 1
+ continue
+
+ if result["detected_regression"]:
+ detected_regressions = True
+ regressions_list.append((env_name, result["percentage_change"]))
+ table_regressions.append(result["line"])
+ total_regressions += 1
+
+ if result["detected_improvement"]:
+ improvements_list.append((env_name, result["percentage_change"]))
+ table_improvements.append(result["line"])
+ total_improvements += 1
+
+ if (
+ not result["detected_regression"]
+ and not result["detected_improvement"]
+ and not result["unstable"]
+ ):
+ total_stable += 1
+ table_stable.append(result["line"])
+
+ # Add to full table if not filtered out
+ if not (
+ print_improvements_only and not result["detected_improvement"]
+ ) and not (print_regressions_only and not result["detected_regression"]):
+ table_full.append(result["line"])
+ total_comparison_points += 1
+
+ # Collect boxplot data
+ if result["boxplot_data"]:
+ boxplot_data.append(result["boxplot_data"])
+
+ progress.close()
+
+ return (
+ detected_regressions,
+ table_full,
+ table_stable,
+ table_unstable,
+ table_improvements,
+ table_regressions,
+ total_improvements,
+ total_regressions,
+ total_stable,
+ total_unstable,
+ total_comparison_points,
+ regressions_list,
+ improvements_list,
+ unstable_list,
+ baseline_only_list,
+ comparison_only_list,
+ no_datapoints_list,
+ False, # group_change (not applicable for env comparison)
+ False, # command_change (not applicable for env comparison)
+ boxplot_data,
+ )
+
+
def from_rts_to_regression_table(
baseline_deployment_name,
comparison_deployment_name,
@@ -1795,6 +2639,8 @@ def get_only_Totals(baseline_timeseries):
)
new_base = []
for ts_name in baseline_timeseries:
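+ # Skip time-series whose key name embeds the "8.5.0" version tag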
+ if "8.5.0" in ts_name:
+ continue
if "io-threads" in ts_name:
continue
if "oss-cluster" in ts_name: