diff --git a/redis_benchmarks_specification/__compare__/args.py b/redis_benchmarks_specification/__compare__/args.py index 5e014b9..c47f3ac 100644 --- a/redis_benchmarks_specification/__compare__/args.py +++ b/redis_benchmarks_specification/__compare__/args.py @@ -230,4 +230,11 @@ def create_compare_arguments(parser): action="store_true", help="Skip interactive approval of changes to github before applying.", ) + parser.add_argument( + "--compare-by-env", + required=False, + default=False, + action="store_true", + help="Compare by environments for each test instead of comparing by tests for each environment (default behavior).", + ) return parser diff --git a/redis_benchmarks_specification/__compare__/compare.py b/redis_benchmarks_specification/__compare__/compare.py index ab7c7b3..1a1bd30 100644 --- a/redis_benchmarks_specification/__compare__/compare.py +++ b/redis_benchmarks_specification/__compare__/compare.py @@ -367,66 +367,140 @@ def compare_command_logic(args, project_name, project_version): ) grafana_link_base = "https://benchmarksredisio.grafana.net/d/1fWbtb7nz/experimental-oss-spec-benchmarks" - ( - detected_regressions, - table_output, - improvements_list, - regressions_list, - total_stable, - total_unstable, - total_comparison_points, - boxplot_data, - command_change, - ) = compute_regression_table( - rts, - tf_github_org, - tf_github_repo, - triggering_env_baseline, - triggering_env_comparison, - metric_name, - comparison_branch, - baseline_branch, - baseline_tag, - comparison_tag, - baseline_deployment_name, - comparison_deployment_name, - print_improvements_only, - print_regressions_only, - skip_unstable, - regressions_percent_lower_limit, - simplify_table, - test, - testname_regex, - verbose, - last_n_baseline, - last_n_comparison, - metric_mode, - from_date, - from_ts_ms, - to_date, - to_ts_ms, - use_metric_context_path, - running_platform_baseline, - running_platform_comparison, - baseline_target_version, - comparison_target_version, - baseline_hash, - comparison_hash, - baseline_github_repo, - comparison_github_repo, - baseline_target_branch, - comparison_target_branch, - baseline_github_org, - comparison_github_org, - args.regression_str, - args.improvement_str, - tests_with_config, - args.use_test_suites_folder, - testsuites_folder, - args.extra_filters, - getattr(args, "command_group_regex", ".*"), - getattr(args, "command_regex", ".*"), - ) + # Check if environment comparison mode is enabled + compare_by_env = getattr(args, "compare_by_env", False) + + if compare_by_env: + logging.info("Environment comparison mode enabled") + # Extract environment list from deployment names in the user's example + env_list = [ + "oss-standalone", + "oss-standalone-02-io-threads", + "oss-standalone-04-io-threads", + "oss-standalone-08-io-threads", + ] + + ( + detected_regressions, + table_output, + improvements_list, + regressions_list, + total_stable, + total_unstable, + total_comparison_points, + boxplot_data, + command_change, + ) = compute_env_comparison_table( + rts, + tf_github_org, + tf_github_repo, + triggering_env_baseline, + triggering_env_comparison, + metric_name, + comparison_branch, + baseline_branch, + baseline_tag, + comparison_tag, + env_list, + print_improvements_only, + print_regressions_only, + skip_unstable, + regressions_percent_lower_limit, + simplify_table, + test, + testname_regex, + verbose, + last_n_baseline, + last_n_comparison, + metric_mode, + from_date, + from_ts_ms, + to_date, + to_ts_ms, + use_metric_context_path, + running_platform_baseline, + 
running_platform_comparison, + baseline_target_version, + comparison_target_version, + baseline_hash, + comparison_hash, + baseline_github_repo, + comparison_github_repo, + baseline_target_branch, + comparison_target_branch, + baseline_github_org, + comparison_github_org, + args.regression_str, + args.improvement_str, + tests_with_config, + args.use_test_suites_folder, + testsuites_folder, + args.extra_filters, + getattr(args, "command_group_regex", ".*"), + getattr(args, "command_regex", ".*"), + ) + else: + logging.info("Default test comparison mode") + ( + detected_regressions, + table_output, + improvements_list, + regressions_list, + total_stable, + total_unstable, + total_comparison_points, + boxplot_data, + command_change, + ) = compute_regression_table( + rts, + tf_github_org, + tf_github_repo, + triggering_env_baseline, + triggering_env_comparison, + metric_name, + comparison_branch, + baseline_branch, + baseline_tag, + comparison_tag, + baseline_deployment_name, + comparison_deployment_name, + print_improvements_only, + print_regressions_only, + skip_unstable, + regressions_percent_lower_limit, + simplify_table, + test, + testname_regex, + verbose, + last_n_baseline, + last_n_comparison, + metric_mode, + from_date, + from_ts_ms, + to_date, + to_ts_ms, + use_metric_context_path, + running_platform_baseline, + running_platform_comparison, + baseline_target_version, + comparison_target_version, + baseline_hash, + comparison_hash, + baseline_github_repo, + comparison_github_repo, + baseline_target_branch, + comparison_target_branch, + baseline_github_org, + comparison_github_org, + args.regression_str, + args.improvement_str, + tests_with_config, + args.use_test_suites_folder, + testsuites_folder, + args.extra_filters, + getattr(args, "command_group_regex", ".*"), + getattr(args, "command_regex", ".*"), + ) total_regressions = len(regressions_list) total_improvements = len(improvements_list) prepare_regression_comment( @@ -696,7 +770,7 @@ def extract_default_branch_and_metric(defaults_filename): return default_baseline_branch, default_metrics_str -def compute_regression_table( +def compute_env_comparison_table( rts, tf_github_org, tf_github_repo, @@ -707,8 +781,7 @@ def compute_regression_table( baseline_branch="unstable", baseline_tag=None, comparison_tag=None, - baseline_deployment_name="oss-standalone", - comparison_deployment_name="oss-standalone", + env_list=None, print_improvements_only=False, print_regressions_only=False, skip_unstable=False, @@ -746,6 +819,9 @@ def compute_regression_table( command_group_regex=".*", command_regex=".*", ): + """ + Compute environment comparison table for a specific test across different environments. 
+ """ START_TIME_NOW_UTC, _, _ = get_start_time_vars() START_TIME_LAST_MONTH_UTC = START_TIME_NOW_UTC - datetime.timedelta(days=31) if from_date is None: @@ -756,13 +832,38 @@ def compute_regression_table( from_ts_ms = int(from_date.timestamp() * 1000) if to_ts_ms is None: to_ts_ms = int(to_date.timestamp() * 1000) - from_human_str = humanize.naturaltime( - dt.datetime.utcfromtimestamp(from_ts_ms / 1000) - ) - to_human_str = humanize.naturaltime(dt.datetime.utcfromtimestamp(to_ts_ms / 1000)) - logging.info( - "Using a time-delta from {} to {}".format(from_human_str, to_human_str) - ) + + # Extract test names from the test parameter + test_names = [] + if test != "": + test_names = test.split(",") + + # If no specific test provided, we need at least one test for environment comparison + if not test_names: + logging.error( + "Environment comparison requires specifying at least one test via --test parameter" + ) + return None, "", [], [], 0, 0, 0, [], False + + # For environment comparison, we focus on the first test if multiple are provided + test_name = test_names[0] + if len(test_names) > 1: + logging.warning( + f"Environment comparison mode: using only the first test '{test_name}' from the provided list" + ) + + # Default environment list if not provided + if env_list is None: + env_list = [ + "oss-standalone", + "oss-standalone-02-io-threads", + "oss-standalone-04-io-threads", + "oss-standalone-08-io-threads", + ] + + logging.info(f"Comparing environments {env_list} for test '{test_name}'") + + # Build baseline and comparison strings baseline_str, by_str_baseline, comparison_str, by_str_comparison = get_by_strings( baseline_branch, comparison_branch, @@ -770,52 +871,13 @@ def compute_regression_table( comparison_tag, baseline_target_version, comparison_target_version, - comparison_hash, baseline_hash, + comparison_hash, baseline_target_branch, comparison_target_branch, ) - logging.info(f"Using baseline filter {by_str_baseline}={baseline_str}") - logging.info(f"Using comparison filter {by_str_comparison}={comparison_str}") - ( - prefix, - testcases_setname, - _, - tsname_project_total_failures, - tsname_project_total_success, - _, - _, - _, - testcases_metric_context_path_setname, - _, - _, - _, - _, - _, - ) = get_overall_dashboard_keynames( - tf_github_org, tf_github_repo, tf_triggering_env_baseline - ) - test_names = [] - used_key = testcases_setname - test_filter = "test_name" - if use_metric_context_path: - test_filter = "test_name:metric_context_path" - used_key = testcases_metric_context_path_setname - tags_regex_string = re.compile(testname_regex) - if test != "": - test_names = test.split(",") - logging.info("Using test name {}".format(test_names)) - elif use_test_suites_folder: - test_names = get_test_names_from_yaml_files( - test_suites_folder, tags_regex_string - ) - else: - test_names = get_test_names_from_db( - rts, tags_regex_string, test_names, used_key - ) - # Apply command regex filtering to tests_with_config - tests_with_config = filter_tests_by_command_regex(tests_with_config, command_regex) + test_filter = "test_name" ( detected_regressions, @@ -838,9 +900,9 @@ def compute_regression_table( group_change, command_change, boxplot_data, - ) = from_rts_to_regression_table( - baseline_deployment_name, - comparison_deployment_name, + ) = from_rts_to_env_comparison_table( + test_name, + env_list, baseline_str, comparison_str, by_str_baseline, @@ -858,7 +920,6 @@ def compute_regression_table( rts, simplify_table, test_filter, - test_names, tf_triggering_env_baseline, 
tf_triggering_env_comparison, verbose, @@ -873,64 +934,23 @@ def compute_regression_table( tests_with_config, extra_filters, ) - logging.info( - "Printing differential analysis between {} and {}".format( - baseline_str, comparison_str - ) - ) - - table_output = "# Comparison between {} and {}.\n\nTime Period from {}. (environment used: {})\n\n".format( - baseline_str, - comparison_str, - from_human_str, - baseline_deployment_name, - ) - table_output += "
<details> \n By GROUP change csv:\n\n"
-    table_output += (
-        "\ncommand_group,min_change,q1_change,median_change,q3_change,max_change \n"
-    )
-    for group_name, changes_list in group_change.items():
-        min_change = min(changes_list)
-        q1_change = np.percentile(changes_list, 25)
-        median_change = np.median(changes_list)
-        q3_change = np.percentile(changes_list, 75)
-        max_change = max(changes_list)
-        table_output += f"{group_name},{min_change:.3f},{q1_change:.3f},{median_change:.3f},{q3_change:.3f},{max_change:.3f}\n"
-    table_output += "\n</details> \n"
-    table_output += "\n\n"
-    table_output += "
<details> \n By COMMAND change csv:\n\n"
+    # Generate table output
+    table_output = f"## Environment Comparison for Test: {test_name}\n\n"
+    table_output += f"**Metric:** {metric_name} ({metric_mode})\n\n"
     table_output += (
-        "\ncommand,min_change,q1_change,median_change,q3_change,max_change \n"
+        f"**Baseline:** {baseline_github_org}/{baseline_github_repo} {baseline_str}\n\n"
     )
-
-    # Filter commands by command group regex if specified
-    filtered_command_change = command_change
-    if command_group_regex != ".*":
-        group_regex = re.compile(command_group_regex)
-        filtered_command_change = {}
-        for command_name, changes_list in command_change.items():
-            command_group = categorize_command(command_name.lower())
-            if re.search(group_regex, command_group):
-                filtered_command_change[command_name] = changes_list
-
-    for command_name, changes_list in filtered_command_change.items():
-        min_change = min(changes_list)
-        q1_change = np.percentile(changes_list, 25)
-        median_change = np.median(changes_list)
-        q3_change = np.percentile(changes_list, 75)
-        max_change = max(changes_list)
-        table_output += f"{command_name},{min_change:.3f},{q1_change:.3f},{median_change:.3f},{q3_change:.3f},{max_change:.3f}\n"
-    table_output += "\n</details>
\n" + table_output += f"**Comparison:** {comparison_github_org}/{comparison_github_repo} {comparison_str}\n\n" if total_unstable > 0: old_stdout = sys.stdout sys.stdout = mystdout = StringIO() - table_output += "#### Unstable Table\n\n" + table_output += "#### Unstable Environments\n\n" writer_regressions = MarkdownTableWriter( table_name="", headers=[ - "Test Case", + "Environment", f"Baseline {baseline_github_org}/{baseline_github_repo} {baseline_str} (median obs. +- std.dev)", f"Comparison {comparison_github_org}/{comparison_github_repo} {comparison_str} (median obs. +- std.dev)", "% change ({})".format(metric_mode), @@ -941,19 +961,19 @@ def compute_regression_table( writer_regressions.dump(mystdout, False) table_output += mystdout.getvalue() table_output += "\n\n" - test_names_str = "|".join([l[0] for l in unstable_list]) - table_output += f"Unstable test regexp names: {test_names_str}\n\n" + env_names_str = "|".join([l[0] for l in unstable_list]) + table_output += f"Unstable environment names: {env_names_str}\n\n" mystdout.close() sys.stdout = old_stdout if total_regressions > 0: old_stdout = sys.stdout sys.stdout = mystdout = StringIO() - table_output += "#### Regressions Table\n\n" + table_output += "#### Regressions by Environment\n\n" writer_regressions = MarkdownTableWriter( table_name="", headers=[ - "Test Case", + "Environment", f"Baseline {baseline_github_org}/{baseline_github_repo} {baseline_str} (median obs. +- std.dev)", f"Comparison {comparison_github_org}/{comparison_github_repo} {comparison_str} (median obs. +- std.dev)", "% change ({})".format(metric_mode), @@ -964,19 +984,19 @@ def compute_regression_table( writer_regressions.dump(mystdout, False) table_output += mystdout.getvalue() table_output += "\n\n" - test_names_str = "|".join([l[0] for l in regressions_list]) - table_output += f"Regressions test regexp names: {test_names_str}\n\n" + env_names_str = "|".join([l[0] for l in regressions_list]) + table_output += f"Regression environment names: {env_names_str}\n\n" mystdout.close() sys.stdout = old_stdout if total_improvements > 0: old_stdout = sys.stdout sys.stdout = mystdout = StringIO() - table_output += "#### Improvements Table\n\n" + table_output += "#### Improvements by Environment\n\n" writer_regressions = MarkdownTableWriter( table_name="", headers=[ - "Test Case", + "Environment", f"Baseline {baseline_github_org}/{baseline_github_repo} {baseline_str} (median obs. +- std.dev)", f"Comparison {comparison_github_org}/{comparison_github_repo} {comparison_str} (median obs. +- std.dev)", "% change ({})".format(metric_mode), @@ -987,8 +1007,8 @@ def compute_regression_table( writer_regressions.dump(mystdout, False) table_output += mystdout.getvalue() table_output += "\n\n" - test_names_str = "|".join([l[0] for l in improvements_list]) - table_output += f"Improvements test regexp names: {test_names_str}\n\n" + env_names_str = "|".join([l[0] for l in improvements_list]) + table_output += f"Improvements environment names: {env_names_str}\n\n" mystdout.close() sys.stdout = old_stdout @@ -997,7 +1017,7 @@ def compute_regression_table( writer_full = MarkdownTableWriter( table_name="", headers=[ - "Test Case", + "Environment", f"Baseline {baseline_github_org}/{baseline_github_repo} {baseline_str} (median obs. +- std.dev)", f"Comparison {comparison_github_org}/{comparison_github_repo} {comparison_str} (median obs. +- std.dev)", "% change ({})".format(metric_mode), @@ -1005,36 +1025,37 @@ def compute_regression_table( ], value_matrix=table_full, ) - table_output += "
<details> \n Full Results table:\n\n"
+    table_output += f"
<details> \n Full Environment Results for Test: {test_name}:\n\n"
     writer_full.dump(mystdout, False)
     sys.stdout = old_stdout
     table_output += mystdout.getvalue()
     table_output += "\n</details>
\n" + len_baseline_only_list = len(baseline_only_list) if len_baseline_only_list > 0: - table_output += f"\n WARNING: There were {len_baseline_only_list} benchmarks with datapoints only on baseline.\n\n" - baseline_only_test_names_str = "|".join([l for l in baseline_only_list]) + table_output += f"\n WARNING: There were {len_baseline_only_list} environments with datapoints only on baseline.\n\n" + baseline_only_env_names_str = "|".join([l for l in baseline_only_list]) table_output += ( - f" Baseline only test regexp names: {baseline_only_test_names_str}\n\n" + f" Baseline only environment names: {baseline_only_env_names_str}\n\n" ) + len_comparison_only_list = len(comparison_only_list) if len_comparison_only_list > 0: - table_output += f"\n WARNING: There were {len_comparison_only_list} benchmarks with datapoints only on comparison.\n\n" - comparison_only_test_names_str = "|".join([l for l in comparison_only_list]) + table_output += f"\n WARNING: There were {len_comparison_only_list} environments with datapoints only on comparison.\n\n" + comparison_only_env_names_str = "|".join([l for l in comparison_only_list]) table_output += ( - f" Comparison only test regexp names: {comparison_only_test_names_str}\n\n" + f" Comparison only environment names: {comparison_only_env_names_str}\n\n" ) - len_no_datapoints = len(no_datapoints_list) - if len_no_datapoints > 0: - table_output += f"\n WARNING: There were {len_no_datapoints} benchmarks with NO datapoints for both baseline and comparison.\n\n" - table_output += "
<details> \n NO datapoints for both baseline and comparison:\n\n"
-        no_datapoints_test_names_str = "|".join([l for l in no_datapoints_list])
+
+    len_no_datapoints_list = len(no_datapoints_list)
+    if len_no_datapoints_list > 0:
+        table_output += f"\n WARNING: There were {len_no_datapoints_list} environments with no datapoints.\n\n"
+        no_datapoints_env_names_str = "|".join([l for l in no_datapoints_list])
         table_output += (
-            f" NO DATAPOINTS test regexp names: {no_datapoints_test_names_str}\n\n"
+            f" No datapoints environment names: {no_datapoints_env_names_str}\n\n"
         )
-        table_output += "\n</details>
\n" return ( detected_regressions, @@ -1049,141 +1070,866 @@ def compute_regression_table( ) -def get_by_error(name, by_str_arr): - by_string = ",".join(by_str_arr) - return f"--{name}-branch, --{name}-tag, --{name}-target-branch, --{name}-hash, and --{name}-target-version are mutually exclusive. You selected a total of {len(by_str_arr)}: {by_string}. Pick one..." - - -def get_by_strings( - baseline_branch, +def compute_regression_table( + rts, + tf_github_org, + tf_github_repo, + tf_triggering_env_baseline, + tf_triggering_env_comparison, + metric_name, comparison_branch, - baseline_tag, - comparison_tag, + baseline_branch="unstable", + baseline_tag=None, + comparison_tag=None, + baseline_deployment_name="oss-standalone", + comparison_deployment_name="oss-standalone", + print_improvements_only=False, + print_regressions_only=False, + skip_unstable=False, + regressions_percent_lower_limit=5.0, + simplify_table=False, + test="", + testname_regex=".*", + verbose=False, + last_n_baseline=-1, + last_n_comparison=-1, + metric_mode="higher-better", + from_date=None, + from_ts_ms=None, + to_date=None, + to_ts_ms=None, + use_metric_context_path=None, + running_platform_baseline=None, + running_platform_comparison=None, baseline_target_version=None, comparison_target_version=None, - baseline_hash=None, comparison_hash=None, + baseline_hash=None, + baseline_github_repo="redis", + comparison_github_repo="redis", baseline_target_branch=None, comparison_target_branch=None, + baseline_github_org="redis", + comparison_github_org="redis", + regression_str="REGRESSION", + improvement_str="IMPROVEMENT", + tests_with_config={}, + use_test_suites_folder=False, + test_suites_folder=None, + extra_filters="", + command_group_regex=".*", + command_regex=".*", ): - baseline_covered = False - comparison_covered = False - by_str_baseline = "" - by_str_comparison = "" - baseline_str = "" - comparison_str = "" - baseline_by_arr = [] - comparison_by_arr = [] + START_TIME_NOW_UTC, _, _ = get_start_time_vars() + START_TIME_LAST_MONTH_UTC = START_TIME_NOW_UTC - datetime.timedelta(days=31) + if from_date is None: + from_date = START_TIME_LAST_MONTH_UTC + if to_date is None: + to_date = START_TIME_NOW_UTC + if from_ts_ms is None: + from_ts_ms = int(from_date.timestamp() * 1000) + if to_ts_ms is None: + to_ts_ms = int(to_date.timestamp() * 1000) + from_human_str = humanize.naturaltime( + dt.datetime.utcfromtimestamp(from_ts_ms / 1000) + ) + to_human_str = humanize.naturaltime(dt.datetime.utcfromtimestamp(to_ts_ms / 1000)) + logging.info( + "Using a time-delta from {} to {}".format(from_human_str, to_human_str) + ) + baseline_str, by_str_baseline, comparison_str, by_str_comparison = get_by_strings( + baseline_branch, + comparison_branch, + baseline_tag, + comparison_tag, + baseline_target_version, + comparison_target_version, + comparison_hash, + baseline_hash, + baseline_target_branch, + comparison_target_branch, + ) + logging.info(f"Using baseline filter {by_str_baseline}={baseline_str}") + logging.info(f"Using comparison filter {by_str_comparison}={comparison_str}") + ( + prefix, + testcases_setname, + _, + tsname_project_total_failures, + tsname_project_total_success, + _, + _, + _, + testcases_metric_context_path_setname, + _, + _, + _, + _, + _, + ) = get_overall_dashboard_keynames( + tf_github_org, tf_github_repo, tf_triggering_env_baseline + ) + test_names = [] + used_key = testcases_setname + test_filter = "test_name" + if use_metric_context_path: + test_filter = "test_name:metric_context_path" + used_key = 
testcases_metric_context_path_setname + tags_regex_string = re.compile(testname_regex) + if test != "": + test_names = test.split(",") + logging.info("Using test name {}".format(test_names)) + elif use_test_suites_folder: + test_names = get_test_names_from_yaml_files( + test_suites_folder, tags_regex_string + ) + else: + test_names = get_test_names_from_db( + rts, tags_regex_string, test_names, used_key + ) - ################# BASELINE BY .... + # Apply command regex filtering to tests_with_config + tests_with_config = filter_tests_by_command_regex(tests_with_config, command_regex) - if baseline_branch is not None: - by_str_baseline = "branch" - baseline_covered = True - baseline_str = baseline_branch - baseline_by_arr.append(by_str_baseline) + ( + detected_regressions, + table_full, + table_stable, + table_unstable, + table_improvements, + table_regressions, + total_improvements, + total_regressions, + total_stable, + total_unstable, + total_comparison_points, + regressions_list, + improvements_list, + unstable_list, + baseline_only_list, + comparison_only_list, + no_datapoints_list, + group_change, + command_change, + boxplot_data, + ) = from_rts_to_regression_table( + baseline_deployment_name, + comparison_deployment_name, + baseline_str, + comparison_str, + by_str_baseline, + by_str_comparison, + from_ts_ms, + to_ts_ms, + last_n_baseline, + last_n_comparison, + metric_mode, + metric_name, + print_improvements_only, + print_regressions_only, + skip_unstable, + regressions_percent_lower_limit, + rts, + simplify_table, + test_filter, + test_names, + tf_triggering_env_baseline, + tf_triggering_env_comparison, + verbose, + running_platform_baseline, + running_platform_comparison, + baseline_github_repo, + comparison_github_repo, + baseline_github_org, + comparison_github_org, + regression_str, + improvement_str, + tests_with_config, + extra_filters, + ) + logging.info( + "Printing differential analysis between {} and {}".format( + baseline_str, comparison_str + ) + ) - if baseline_tag is not None: - by_str_baseline = "version" - if baseline_covered: - baseline_by_arr.append(by_str_baseline) - logging.error(get_by_error("baseline", baseline_by_arr)) - exit(1) - baseline_covered = True - baseline_str = baseline_tag + table_output = "# Comparison between {} and {}.\n\nTime Period from {}. (environment used: {})\n\n".format( + baseline_str, + comparison_str, + from_human_str, + baseline_deployment_name, + ) + + table_output += "
<details> \n By GROUP change csv:\n\n"
+    table_output += (
+        "\ncommand_group,min_change,q1_change,median_change,q3_change,max_change \n"
+    )
+    for group_name, changes_list in group_change.items():
+        min_change = min(changes_list)
+        q1_change = np.percentile(changes_list, 25)
+        median_change = np.median(changes_list)
+        q3_change = np.percentile(changes_list, 75)
+        max_change = max(changes_list)
+        table_output += f"{group_name},{min_change:.3f},{q1_change:.3f},{median_change:.3f},{q3_change:.3f},{max_change:.3f}\n"
+    table_output += "\n</details> \n"
+    table_output += "\n\n"
+    table_output += "
<details> \n By COMMAND change csv:\n\n"
+    table_output += (
+        "\ncommand,min_change,q1_change,median_change,q3_change,max_change \n"
+    )
+
+    # Filter commands by command group regex if specified
+    filtered_command_change = command_change
+    if command_group_regex != ".*":
+        group_regex = re.compile(command_group_regex)
+        filtered_command_change = {}
+        for command_name, changes_list in command_change.items():
+            command_group = categorize_command(command_name.lower())
+            if re.search(group_regex, command_group):
+                filtered_command_change[command_name] = changes_list
+
+    for command_name, changes_list in filtered_command_change.items():
+        min_change = min(changes_list)
+        q1_change = np.percentile(changes_list, 25)
+        median_change = np.median(changes_list)
+        q3_change = np.percentile(changes_list, 75)
+        max_change = max(changes_list)
+        table_output += f"{command_name},{min_change:.3f},{q1_change:.3f},{median_change:.3f},{q3_change:.3f},{max_change:.3f}\n"
+    table_output += "\n</details>
\n" + + if total_unstable > 0: + old_stdout = sys.stdout + sys.stdout = mystdout = StringIO() + table_output += "#### Unstable Table\n\n" + writer_regressions = MarkdownTableWriter( + table_name="", + headers=[ + "Test Case", + f"Baseline {baseline_github_org}/{baseline_github_repo} {baseline_str} (median obs. +- std.dev)", + f"Comparison {comparison_github_org}/{comparison_github_repo} {comparison_str} (median obs. +- std.dev)", + "% change ({})".format(metric_mode), + "Note", + ], + value_matrix=table_unstable, + ) + writer_regressions.dump(mystdout, False) + table_output += mystdout.getvalue() + table_output += "\n\n" + test_names_str = "|".join([l[0] for l in unstable_list]) + table_output += f"Unstable test regexp names: {test_names_str}\n\n" + mystdout.close() + sys.stdout = old_stdout + + if total_regressions > 0: + old_stdout = sys.stdout + sys.stdout = mystdout = StringIO() + table_output += "#### Regressions Table\n\n" + writer_regressions = MarkdownTableWriter( + table_name="", + headers=[ + "Test Case", + f"Baseline {baseline_github_org}/{baseline_github_repo} {baseline_str} (median obs. +- std.dev)", + f"Comparison {comparison_github_org}/{comparison_github_repo} {comparison_str} (median obs. +- std.dev)", + "% change ({})".format(metric_mode), + "Note", + ], + value_matrix=table_regressions, + ) + writer_regressions.dump(mystdout, False) + table_output += mystdout.getvalue() + table_output += "\n\n" + test_names_str = "|".join([l[0] for l in regressions_list]) + table_output += f"Regressions test regexp names: {test_names_str}\n\n" + mystdout.close() + sys.stdout = old_stdout + + if total_improvements > 0: + old_stdout = sys.stdout + sys.stdout = mystdout = StringIO() + table_output += "#### Improvements Table\n\n" + writer_regressions = MarkdownTableWriter( + table_name="", + headers=[ + "Test Case", + f"Baseline {baseline_github_org}/{baseline_github_repo} {baseline_str} (median obs. +- std.dev)", + f"Comparison {comparison_github_org}/{comparison_github_repo} {comparison_str} (median obs. +- std.dev)", + "% change ({})".format(metric_mode), + "Note", + ], + value_matrix=table_improvements, + ) + writer_regressions.dump(mystdout, False) + table_output += mystdout.getvalue() + table_output += "\n\n" + test_names_str = "|".join([l[0] for l in improvements_list]) + table_output += f"Improvements test regexp names: {test_names_str}\n\n" + mystdout.close() + sys.stdout = old_stdout + + old_stdout = sys.stdout + sys.stdout = mystdout = StringIO() + writer_full = MarkdownTableWriter( + table_name="", + headers=[ + "Test Case", + f"Baseline {baseline_github_org}/{baseline_github_repo} {baseline_str} (median obs. +- std.dev)", + f"Comparison {comparison_github_org}/{comparison_github_repo} {comparison_str} (median obs. +- std.dev)", + "% change ({})".format(metric_mode), + "Note", + ], + value_matrix=table_full, + ) + table_output += "
<details> \n Full Results table:\n\n"
+
+    writer_full.dump(mystdout, False)
+
+    sys.stdout = old_stdout
+    table_output += mystdout.getvalue()
+    table_output += "\n</details>
\n" + len_baseline_only_list = len(baseline_only_list) + if len_baseline_only_list > 0: + table_output += f"\n WARNING: There were {len_baseline_only_list} benchmarks with datapoints only on baseline.\n\n" + baseline_only_test_names_str = "|".join([l for l in baseline_only_list]) + table_output += ( + f" Baseline only test regexp names: {baseline_only_test_names_str}\n\n" + ) + len_comparison_only_list = len(comparison_only_list) + if len_comparison_only_list > 0: + table_output += f"\n WARNING: There were {len_comparison_only_list} benchmarks with datapoints only on comparison.\n\n" + comparison_only_test_names_str = "|".join([l for l in comparison_only_list]) + table_output += ( + f" Comparison only test regexp names: {comparison_only_test_names_str}\n\n" + ) + len_no_datapoints = len(no_datapoints_list) + if len_no_datapoints > 0: + table_output += f"\n WARNING: There were {len_no_datapoints} benchmarks with NO datapoints for both baseline and comparison.\n\n" + table_output += "
<details> \n NO datapoints for both baseline and comparison:\n\n"
+        no_datapoints_test_names_str = "|".join([l for l in no_datapoints_list])
+        table_output += (
+            f" NO DATAPOINTS test regexp names: {no_datapoints_test_names_str}\n\n"
+        )
+        table_output += "\n</details>
\n" + + return ( + detected_regressions, + table_output, + improvements_list, + regressions_list, + total_stable, + total_unstable, + total_comparison_points, + boxplot_data, + command_change, + ) + + +def get_by_error(name, by_str_arr): + by_string = ",".join(by_str_arr) + return f"--{name}-branch, --{name}-tag, --{name}-target-branch, --{name}-hash, and --{name}-target-version are mutually exclusive. You selected a total of {len(by_str_arr)}: {by_string}. Pick one..." + + +def get_by_strings( + baseline_branch, + comparison_branch, + baseline_tag, + comparison_tag, + baseline_target_version=None, + comparison_target_version=None, + baseline_hash=None, + comparison_hash=None, + baseline_target_branch=None, + comparison_target_branch=None, +): + baseline_covered = False + comparison_covered = False + by_str_baseline = "" + by_str_comparison = "" + baseline_str = "" + comparison_str = "" + baseline_by_arr = [] + comparison_by_arr = [] + + ################# BASELINE BY .... + + if baseline_branch is not None: + by_str_baseline = "branch" + baseline_covered = True + baseline_str = baseline_branch + baseline_by_arr.append(by_str_baseline) + + if baseline_tag is not None: + by_str_baseline = "version" + if baseline_covered: + baseline_by_arr.append(by_str_baseline) + logging.error(get_by_error("baseline", baseline_by_arr)) + exit(1) + baseline_covered = True + baseline_str = baseline_tag + + if baseline_target_version is not None: + by_str_baseline = "target+version" + if baseline_covered: + baseline_by_arr.append(by_str_baseline) + logging.error(get_by_error("baseline", baseline_by_arr)) + exit(1) + baseline_covered = True + baseline_str = baseline_target_version + + if baseline_hash is not None: + by_str_baseline = "hash" + if baseline_covered: + baseline_by_arr.append(by_str_baseline) + logging.error(get_by_error("baseline", baseline_by_arr)) + exit(1) + baseline_covered = True + baseline_str = baseline_hash + if baseline_target_branch is not None: + by_str_baseline = "target+branch" + if baseline_covered: + baseline_by_arr.append(by_str_baseline) + logging.error(get_by_error("baseline", baseline_by_arr)) + exit(1) + baseline_covered = True + baseline_str = baseline_target_branch + + ################# COMPARISON BY .... + + if comparison_branch is not None: + by_str_comparison = "branch" + comparison_covered = True + comparison_str = comparison_branch + + if comparison_tag is not None: + # check if we had already covered comparison + if comparison_covered: + logging.error( + "--comparison-branch and --comparison-tag, --comparison-hash, --comparison-target-branch, and --comparison-target-table are mutually exclusive. Pick one..." + ) + exit(1) + comparison_covered = True + by_str_comparison = "version" + comparison_str = comparison_tag + if comparison_target_version is not None: + # check if we had already covered comparison + if comparison_covered: + logging.error( + "--comparison-branch, --comparison-tag, --comparison-hash, --comparison-target-branch, and --comparison-target-table are mutually exclusive. Pick one..." + ) + exit(1) + comparison_covered = True + by_str_comparison = "target+version" + comparison_str = comparison_target_version + + if comparison_target_branch is not None: + # check if we had already covered comparison + if comparison_covered: + logging.error( + "--comparison-branch, --comparison-tag, --comparison-hash, --comparison-target-branch, and --comparison-target-table are mutually exclusive. Pick one..." 
+ ) + exit(1) + comparison_covered = True + by_str_comparison = "target+branch" + comparison_str = comparison_target_branch + + if comparison_hash is not None: + # check if we had already covered comparison + # if comparison_covered: + # logging.error( + # "--comparison-branch, --comparison-tag, --comparison-hash, --comparison-target-branch, and --comparison-target-table are mutually exclusive. Pick one..." + # ) + # exit(1) + comparison_covered = True + by_str_comparison = "hash" + comparison_str = comparison_hash + + if baseline_covered is False: + logging.error( + "You need to provider either " + + "( --baseline-branch, --baseline-tag, --baseline-hash, --baseline-target-branch or --baseline-target-version ) " + ) + exit(1) + if comparison_covered is False: + logging.error( + "You need to provider either " + + "( --comparison-branch, --comparison-tag, --comparison-hash, --comparison-target-branch or --comparison-target-version ) " + ) + exit(1) + return baseline_str, by_str_baseline, comparison_str, by_str_comparison + + +def process_single_test_comparison( + test_name, + tests_with_config, + original_metric_mode, + baseline_str, + comparison_str, + by_str_baseline, + by_str_comparison, + metric_name, + test_filter, + baseline_github_repo, + comparison_github_repo, + tf_triggering_env_baseline, + tf_triggering_env_comparison, + extra_filters, + baseline_deployment_name, + comparison_deployment_name, + baseline_github_org, + comparison_github_org, + running_platform_baseline, + running_platform_comparison, + rts, + from_ts_ms, + to_ts_ms, + last_n_baseline, + last_n_comparison, + verbose, + regressions_percent_lower_limit, + simplify_table, + regression_str, + improvement_str, + progress, +): + """ + Process comparison analysis for a single test. + + Returns a dictionary containing all the results and side effects that need to be + accumulated by the caller. 
+ """ + tested_groups = [] + tested_commands = [] + if test_name in tests_with_config: + test_spec = tests_with_config[test_name] + if "tested-groups" in test_spec: + tested_groups = test_spec["tested-groups"] + if "tested-commands" in test_spec: + tested_commands = test_spec["tested-commands"] + else: + logging.error(f"Test does not contain spec info: {test_name}") + + metric_mode = original_metric_mode + compare_version = "main" + # GE + github_link = "https://github.com/redis/redis-benchmarks-specification/blob" + test_path = f"redis_benchmarks_specification/test-suites/{test_name}.yml" + test_link = f"[{test_name}]({github_link}/{compare_version}/{test_path})" + multi_value_baseline = check_multi_value_filter(baseline_str) + multi_value_comparison = check_multi_value_filter(comparison_str) + + filters_baseline = [ + "metric={}".format(metric_name), + "{}={}".format(test_filter, test_name), + "github_repo={}".format(baseline_github_repo), + "triggering_env={}".format(tf_triggering_env_baseline), + ] + if extra_filters != "": + filters_baseline.append(extra_filters) + if baseline_str != "": + filters_baseline.append("{}={}".format(by_str_baseline, baseline_str)) + if baseline_deployment_name != "": + filters_baseline.append("deployment_name={}".format(baseline_deployment_name)) + if baseline_github_org != "": + filters_baseline.append(f"github_org={baseline_github_org}") + if running_platform_baseline is not None and running_platform_baseline != "": + filters_baseline.append("running_platform={}".format(running_platform_baseline)) + filters_comparison = [ + "metric={}".format(metric_name), + "{}={}".format(test_filter, test_name), + "github_repo={}".format(comparison_github_repo), + "triggering_env={}".format(tf_triggering_env_comparison), + ] + if comparison_str != "": + filters_comparison.append("{}={}".format(by_str_comparison, comparison_str)) + if comparison_deployment_name != "": + filters_comparison.append( + "deployment_name={}".format(comparison_deployment_name) + ) + if extra_filters != "": + filters_comparison.append(extra_filters) + if comparison_github_org != "": + filters_comparison.append(f"github_org={comparison_github_org}") + if "hash" not in by_str_baseline: + filters_baseline.append("hash==") + if "hash" not in by_str_comparison: + filters_comparison.append("hash==") + if running_platform_comparison is not None and running_platform_comparison != "": + filters_comparison.append( + "running_platform={}".format(running_platform_comparison) + ) + baseline_timeseries = rts.ts().queryindex(filters_baseline) + comparison_timeseries = rts.ts().queryindex(filters_comparison) + + # avoiding target time-series + comparison_timeseries = [x for x in comparison_timeseries if "target" not in x] + baseline_timeseries = [x for x in baseline_timeseries if "target" not in x] + progress.update() + if verbose: + logging.info( + "Baseline timeseries for {}: {}. test={}".format( + baseline_str, len(baseline_timeseries), test_name + ) + ) + logging.info( + "Comparison timeseries for {}: {}. 
test={}".format( + comparison_str, len(comparison_timeseries), test_name + ) + ) + if len(baseline_timeseries) > 1 and multi_value_baseline is False: + baseline_timeseries = get_only_Totals(baseline_timeseries) + + # Initialize result dictionary + result = { + "skip_test": False, + "no_datapoints_baseline": False, + "no_datapoints_comparison": False, + "no_datapoints_both": False, + "baseline_only": False, + "comparison_only": False, + "detected_regression": False, + "detected_improvement": False, + "unstable": False, + "should_add_line": False, + "line": None, + "percentage_change": 0.0, + "tested_groups": tested_groups, + "tested_commands": tested_commands, + "boxplot_data": None, + } + + if len(baseline_timeseries) == 0: + logging.warning( + f"No datapoints for test={test_name} for baseline timeseries {baseline_timeseries}" + ) + result["no_datapoints_baseline"] = True + result["no_datapoints_both"] = True + + if len(comparison_timeseries) == 0: + logging.warning( + f"No datapoints for test={test_name} for comparison timeseries {comparison_timeseries}" + ) + result["no_datapoints_comparison"] = True + result["no_datapoints_both"] = True + + if len(baseline_timeseries) != 1 and multi_value_baseline is False: + if verbose: + logging.warning( + "Skipping this test given the value of timeseries !=1. Baseline timeseries {}".format( + len(baseline_timeseries) + ) + ) + if len(baseline_timeseries) > 1: + logging.warning( + "\t\tTime-series: {}".format(", ".join(baseline_timeseries)) + ) + result["skip_test"] = True + return result + + if len(comparison_timeseries) > 1 and multi_value_comparison is False: + comparison_timeseries = get_only_Totals(comparison_timeseries) + if len(comparison_timeseries) != 1 and multi_value_comparison is False: + if verbose: + logging.warning( + "Comparison timeseries {}".format(len(comparison_timeseries)) + ) + result["skip_test"] = True + return result + + baseline_v = "N/A" + comparison_v = "N/A" + baseline_values = [] + baseline_datapoints = [] + comparison_values = [] + comparison_datapoints = [] + percentage_change = 0.0 + baseline_v_str = "N/A" + comparison_v_str = "N/A" + largest_variance = 0 + baseline_pct_change = "N/A" + comparison_pct_change = "N/A" + + note = "" + try: + for ts_name_baseline in baseline_timeseries: + datapoints_inner = rts.ts().revrange(ts_name_baseline, from_ts_ms, to_ts_ms) + baseline_datapoints.extend(datapoints_inner) + ( + baseline_pct_change, + baseline_v, + largest_variance, + ) = get_v_pct_change_and_largest_var( + baseline_datapoints, + baseline_pct_change, + baseline_v, + baseline_values, + largest_variance, + last_n_baseline, + verbose, + ) + for ts_name_comparison in comparison_timeseries: + datapoints_inner = rts.ts().revrange( + ts_name_comparison, from_ts_ms, to_ts_ms + ) + comparison_datapoints.extend(datapoints_inner) + + ( + comparison_pct_change, + comparison_v, + largest_variance, + ) = get_v_pct_change_and_largest_var( + comparison_datapoints, + comparison_pct_change, + comparison_v, + comparison_values, + largest_variance, + last_n_comparison, + verbose, + ) + + waterline = regressions_percent_lower_limit + # if regressions_percent_lower_limit < largest_variance: + # note = "waterline={:.1f}%.".format(largest_variance) + # waterline = largest_variance + + except redis.exceptions.ResponseError as e: + logging.error( + "Detected a redis.exceptions.ResponseError. {}".format(e.__str__()) + ) + pass + except ZeroDivisionError as e: + logging.error("Detected a ZeroDivisionError. 
{}".format(e.__str__())) + pass + + unstable = False + + if baseline_v != "N/A" and comparison_v == "N/A": + logging.warning( + f"Baseline contains datapoints but comparison not for test: {test_name}" + ) + result["baseline_only"] = True + if comparison_v != "N/A" and baseline_v == "N/A": + logging.warning( + f"Comparison contains datapoints but baseline not for test: {test_name}" + ) + result["comparison_only"] = True + if ( + baseline_v != "N/A" + and comparison_pct_change != "N/A" + and comparison_v != "N/A" + and baseline_pct_change != "N/A" + ): + if comparison_pct_change > 10.0 or baseline_pct_change > 10.0: + note = "UNSTABLE (very high variance)" + unstable = True + result["unstable"] = True + + baseline_v_str = prepare_value_str( + baseline_pct_change, + baseline_v, + baseline_values, + simplify_table, + metric_name, + ) + comparison_v_str = prepare_value_str( + comparison_pct_change, + comparison_v, + comparison_values, + simplify_table, + metric_name, + ) - if baseline_target_version is not None: - by_str_baseline = "target+version" - if baseline_covered: - baseline_by_arr.append(by_str_baseline) - logging.error(get_by_error("baseline", baseline_by_arr)) - exit(1) - baseline_covered = True - baseline_str = baseline_target_version + if metric_mode == "higher-better": + percentage_change = (float(comparison_v) / float(baseline_v) - 1) * 100.0 + else: + # lower-better + percentage_change = ( + -(float(baseline_v) - float(comparison_v)) / float(baseline_v) + ) * 100.0 - if baseline_hash is not None: - by_str_baseline = "hash" - if baseline_covered: - baseline_by_arr.append(by_str_baseline) - logging.error(get_by_error("baseline", baseline_by_arr)) - exit(1) - baseline_covered = True - baseline_str = baseline_hash - if baseline_target_branch is not None: - by_str_baseline = "target+branch" - if baseline_covered: - baseline_by_arr.append(by_str_baseline) - logging.error(get_by_error("baseline", baseline_by_arr)) - exit(1) - baseline_covered = True - baseline_str = baseline_target_branch + # Collect data for box plot + result["boxplot_data"] = (test_name, percentage_change) + else: + logging.warn( + f"Missing data for test {test_name}. baseline_v={baseline_v} (pct_change={baseline_pct_change}), comparison_v={comparison_v} (pct_change={comparison_pct_change}) " + ) - ################# COMPARISON BY .... + result["percentage_change"] = percentage_change - if comparison_branch is not None: - by_str_comparison = "branch" - comparison_covered = True - comparison_str = comparison_branch + if baseline_v != "N/A" or comparison_v != "N/A": + detected_regression = False + detected_improvement = False + noise_waterline = 3 - if comparison_tag is not None: - # check if we had already covered comparison - if comparison_covered: - logging.error( - "--comparison-branch and --comparison-tag, --comparison-hash, --comparison-target-branch, and --comparison-target-table are mutually exclusive. Pick one..." - ) - exit(1) - comparison_covered = True - by_str_comparison = "version" - comparison_str = comparison_tag - if comparison_target_version is not None: - # check if we had already covered comparison - if comparison_covered: - logging.error( - "--comparison-branch, --comparison-tag, --comparison-hash, --comparison-target-branch, and --comparison-target-table are mutually exclusive. Pick one..." 
- ) - exit(1) - comparison_covered = True - by_str_comparison = "target+version" - comparison_str = comparison_target_version + # For higher-better metrics: negative change = regression, positive change = improvement + # For lower-better metrics: positive change = regression, negative change = improvement + if metric_mode == "higher-better": + # Higher is better: negative change is bad (regression), positive change is good (improvement) + if percentage_change < 0.0: + if -waterline >= percentage_change: + detected_regression = True + note = note + f" {regression_str}" + elif percentage_change < -noise_waterline: + if simplify_table is False: + note = note + f" potential {regression_str}" + else: + if simplify_table is False: + note = note + " No Change" - if comparison_target_branch is not None: - # check if we had already covered comparison - if comparison_covered: - logging.error( - "--comparison-branch, --comparison-tag, --comparison-hash, --comparison-target-branch, and --comparison-target-table are mutually exclusive. Pick one..." - ) - exit(1) - comparison_covered = True - by_str_comparison = "target+branch" - comparison_str = comparison_target_branch + if percentage_change > 0.0: + if percentage_change > waterline: + detected_improvement = True + note = note + f" {improvement_str}" + elif percentage_change > noise_waterline: + if simplify_table is False: + note = note + f" potential {improvement_str}" + else: + if simplify_table is False: + note = note + " No Change" + else: + # Lower is better: positive change is bad (regression), negative change is good (improvement) + if percentage_change > 0.0: + if percentage_change >= waterline: + detected_regression = True + note = note + f" {regression_str}" + elif percentage_change > noise_waterline: + if simplify_table is False: + note = note + f" potential {regression_str}" + else: + if simplify_table is False: + note = note + " No Change" - if comparison_hash is not None: - # check if we had already covered comparison - # if comparison_covered: - # logging.error( - # "--comparison-branch, --comparison-tag, --comparison-hash, --comparison-target-branch, and --comparison-target-table are mutually exclusive. Pick one..." 
- # ) - # exit(1) - comparison_covered = True - by_str_comparison = "hash" - comparison_str = comparison_hash + if percentage_change < 0.0: + if -percentage_change > waterline: + detected_improvement = True + note = note + f" {improvement_str}" + elif -percentage_change > noise_waterline: + if simplify_table is False: + note = note + f" potential {improvement_str}" + else: + if simplify_table is False: + note = note + " No Change" - if baseline_covered is False: - logging.error( - "You need to provider either " - + "( --baseline-branch, --baseline-tag, --baseline-hash, --baseline-target-branch or --baseline-target-version ) " + result["detected_regression"] = detected_regression + result["detected_improvement"] = detected_improvement + + line = get_line( + baseline_v_str, + comparison_v_str, + note, + percentage_change, + test_link, ) - exit(1) - if comparison_covered is False: - logging.error( - "You need to provider either " - + "( --comparison-branch, --comparison-tag, --comparison-hash, --comparison-target-branch or --comparison-target-version ) " + result["line"] = line + else: + logging.warning( + "There were no datapoints both for baseline and comparison for test: {test_name}" ) - exit(1) - return baseline_str, by_str_baseline, comparison_str, by_str_comparison + result["no_datapoints_both"] = True + return result -def process_single_test_comparison( + +def process_single_env_comparison( + env_name, test_name, tests_with_config, original_metric_mode, @@ -1198,8 +1944,6 @@ def process_single_test_comparison( tf_triggering_env_baseline, tf_triggering_env_comparison, extra_filters, - baseline_deployment_name, - comparison_deployment_name, baseline_github_org, comparison_github_org, running_platform_baseline, @@ -1217,21 +1961,31 @@ def process_single_test_comparison( progress, ): """ - Process comparison analysis for a single test. - - Returns a dictionary containing all the results and side effects that need to be - accumulated by the caller. + Process comparison analysis for a single environment for a specific test. + This is similar to process_single_test_comparison but focuses on environment comparison. 
""" - tested_groups = [] - tested_commands = [] - if test_name in tests_with_config: - test_spec = tests_with_config[test_name] - if "tested-groups" in test_spec: - tested_groups = test_spec["tested-groups"] - if "tested-commands" in test_spec: - tested_commands = test_spec["tested-commands"] - else: - logging.error(f"Test does not contain spec info: {test_name}") + result = { + "baseline_only": False, + "comparison_only": False, + "unstable": False, + "detected_regression": False, + "detected_improvement": False, + "no_datapoints_both": False, + "percentage_change": "N/A", + "line": [], + "boxplot_data": None, + } + + baseline_datapoints = [] + comparison_datapoints = [] + baseline_values = [] + comparison_values = [] + baseline_v = "N/A" + comparison_v = "N/A" + baseline_pct_change = "N/A" + comparison_pct_change = "N/A" + percentage_change = "N/A" + largest_variance = 0.0 metric_mode = original_metric_mode compare_version = "main" @@ -1242,136 +1996,68 @@ def process_single_test_comparison( multi_value_baseline = check_multi_value_filter(baseline_str) multi_value_comparison = check_multi_value_filter(comparison_str) + # Build filters for baseline environment filters_baseline = [ "metric={}".format(metric_name), "{}={}".format(test_filter, test_name), "github_repo={}".format(baseline_github_repo), "triggering_env={}".format(tf_triggering_env_baseline), + "deployment_name={}".format(env_name), # Use env_name as deployment_name ] if extra_filters != "": filters_baseline.append(extra_filters) if baseline_str != "": filters_baseline.append("{}={}".format(by_str_baseline, baseline_str)) - if baseline_deployment_name != "": - filters_baseline.append("deployment_name={}".format(baseline_deployment_name)) if baseline_github_org != "": filters_baseline.append(f"github_org={baseline_github_org}") if running_platform_baseline is not None and running_platform_baseline != "": filters_baseline.append("running_platform={}".format(running_platform_baseline)) + + # Build filters for comparison environment (same environment, different branch/version) filters_comparison = [ "metric={}".format(metric_name), "{}={}".format(test_filter, test_name), "github_repo={}".format(comparison_github_repo), "triggering_env={}".format(tf_triggering_env_comparison), + "deployment_name={}".format(env_name), # Use same env_name ] if comparison_str != "": filters_comparison.append("{}={}".format(by_str_comparison, comparison_str)) - if comparison_deployment_name != "": - filters_comparison.append( - "deployment_name={}".format(comparison_deployment_name) - ) if extra_filters != "": filters_comparison.append(extra_filters) if comparison_github_org != "": - filters_comparison.append(f"github_org={comparison_github_org}") - if "hash" not in by_str_baseline: - filters_baseline.append("hash==") - if "hash" not in by_str_comparison: - filters_comparison.append("hash==") - if running_platform_comparison is not None and running_platform_comparison != "": - filters_comparison.append( - "running_platform={}".format(running_platform_comparison) - ) - baseline_timeseries = rts.ts().queryindex(filters_baseline) - comparison_timeseries = rts.ts().queryindex(filters_comparison) - - # avoiding target time-series - comparison_timeseries = [x for x in comparison_timeseries if "target" not in x] - baseline_timeseries = [x for x in baseline_timeseries if "target" not in x] - progress.update() - if verbose: - logging.info( - "Baseline timeseries for {}: {}. 
test={}".format( - baseline_str, len(baseline_timeseries), test_name - ) - ) - logging.info( - "Comparison timeseries for {}: {}. test={}".format( - comparison_str, len(comparison_timeseries), test_name - ) - ) - if len(baseline_timeseries) > 1 and multi_value_baseline is False: - baseline_timeseries = get_only_Totals(baseline_timeseries) - - # Initialize result dictionary - result = { - "skip_test": False, - "no_datapoints_baseline": False, - "no_datapoints_comparison": False, - "no_datapoints_both": False, - "baseline_only": False, - "comparison_only": False, - "detected_regression": False, - "detected_improvement": False, - "unstable": False, - "should_add_line": False, - "line": None, - "percentage_change": 0.0, - "tested_groups": tested_groups, - "tested_commands": tested_commands, - "boxplot_data": None, - } - - if len(baseline_timeseries) == 0: - logging.warning( - f"No datapoints for test={test_name} for baseline timeseries {baseline_timeseries}" + filters_comparison.append(f"github_org={comparison_github_org}") + if "hash" not in by_str_baseline: + filters_baseline.append("hash==") + if "hash" not in by_str_comparison: + filters_comparison.append("hash==") + if running_platform_comparison is not None and running_platform_comparison != "": + filters_comparison.append( + "running_platform={}".format(running_platform_comparison) ) - result["no_datapoints_baseline"] = True - result["no_datapoints_both"] = True - if len(comparison_timeseries) == 0: - logging.warning( - f"No datapoints for test={test_name} for comparison timeseries {comparison_timeseries}" - ) - result["no_datapoints_comparison"] = True - result["no_datapoints_both"] = True + baseline_timeseries = rts.ts().queryindex(filters_baseline) + comparison_timeseries = rts.ts().queryindex(filters_comparison) - if len(baseline_timeseries) != 1 and multi_value_baseline is False: - if verbose: - logging.warning( - "Skipping this test given the value of timeseries !=1. Baseline timeseries {}".format( - len(baseline_timeseries) - ) + # avoiding target time-series + comparison_timeseries = [x for x in comparison_timeseries if "target" not in x] + baseline_timeseries = [x for x in baseline_timeseries if "target" not in x] + progress.update() + if verbose: + logging.info( + "Baseline timeseries for {} (env: {}): {}. test={}".format( + baseline_str, env_name, len(baseline_timeseries), test_name ) - if len(baseline_timeseries) > 1: - logging.warning( - "\t\tTime-series: {}".format(", ".join(baseline_timeseries)) - ) - result["skip_test"] = True - return result - + ) + logging.info( + "Comparison timeseries for {} (env: {}): {}. 
test={}".format( + comparison_str, env_name, len(comparison_timeseries), test_name + ) + ) + if len(baseline_timeseries) > 1 and multi_value_baseline is False: + baseline_timeseries = get_only_Totals(baseline_timeseries) if len(comparison_timeseries) > 1 and multi_value_comparison is False: comparison_timeseries = get_only_Totals(comparison_timeseries) - if len(comparison_timeseries) != 1 and multi_value_comparison is False: - if verbose: - logging.warning( - "Comparison timeseries {}".format(len(comparison_timeseries)) - ) - result["skip_test"] = True - return result - - baseline_v = "N/A" - comparison_v = "N/A" - baseline_values = [] - baseline_datapoints = [] - comparison_values = [] - comparison_datapoints = [] - percentage_change = 0.0 - baseline_v_str = "N/A" - comparison_v_str = "N/A" - largest_variance = 0 - baseline_pct_change = "N/A" - comparison_pct_change = "N/A" note = "" try: @@ -1412,9 +2098,6 @@ def process_single_test_comparison( ) waterline = regressions_percent_lower_limit - # if regressions_percent_lower_limit < largest_variance: - # note = "waterline={:.1f}%.".format(largest_variance) - # waterline = largest_variance except redis.exceptions.ResponseError as e: logging.error( @@ -1429,12 +2112,12 @@ def process_single_test_comparison( if baseline_v != "N/A" and comparison_v == "N/A": logging.warning( - f"Baseline contains datapoints but comparison not for test: {test_name}" + f"Baseline contains datapoints but comparison not for env: {env_name}, test: {test_name}" ) result["baseline_only"] = True if comparison_v != "N/A" and baseline_v == "N/A": logging.warning( - f"Comparison contains datapoints but baseline not for test: {test_name}" + f"Comparison contains datapoints but baseline not for env: {env_name}, test: {test_name}" ) result["comparison_only"] = True if ( @@ -1472,67 +2155,45 @@ def process_single_test_comparison( ) * 100.0 # Collect data for box plot - result["boxplot_data"] = (test_name, percentage_change) + result["boxplot_data"] = (env_name, percentage_change) else: logging.warn( - f"Missing data for test {test_name}. baseline_v={baseline_v} (pct_change={baseline_pct_change}), comparison_v={comparison_v} (pct_change={comparison_pct_change}) " + f"Missing data for env {env_name}, test {test_name}. 
 
     result["percentage_change"] = percentage_change
 
-    if baseline_v != "N/A" or comparison_v != "N/A":
+    if (
+        baseline_v != "N/A"
+        and comparison_v != "N/A"
+        and percentage_change != "N/A"
+        and unstable is False
+    ):
         detected_regression = False
         detected_improvement = False
-        noise_waterline = 3
-
-        # For higher-better metrics: negative change = regression, positive change = improvement
-        # For lower-better metrics: positive change = regression, negative change = improvement
-        if metric_mode == "higher-better":
-            # Higher is better: negative change is bad (regression), positive change is good (improvement)
-            if percentage_change < 0.0:
-                if -waterline >= percentage_change:
-                    detected_regression = True
-                    note = note + f" {regression_str}"
-                elif percentage_change < -noise_waterline:
-                    if simplify_table is False:
-                        note = note + f" potential {regression_str}"
-                else:
-                    if simplify_table is False:
-                        note = note + " No Change"
-
-            if percentage_change > 0.0:
-                if percentage_change > waterline:
-                    detected_improvement = True
-                    note = note + f" {improvement_str}"
-                elif percentage_change > noise_waterline:
-                    if simplify_table is False:
-                        note = note + f" potential {improvement_str}"
-                else:
-                    if simplify_table is False:
-                        note = note + " No Change"
-        else:
-            # Lower is better: positive change is bad (regression), negative change is good (improvement)
-            if percentage_change > 0.0:
-                if percentage_change >= waterline:
-                    detected_regression = True
-                    note = note + f" {regression_str}"
-                elif percentage_change > noise_waterline:
-                    if simplify_table is False:
-                        note = note + f" potential {regression_str}"
-                else:
-                    if simplify_table is False:
-                        note = note + " No Change"
-
-            if percentage_change < 0.0:
-                if -percentage_change > waterline:
-                    detected_improvement = True
-                    note = note + f" {improvement_str}"
-                elif -percentage_change > noise_waterline:
-                    if simplify_table is False:
-                        note = note + f" potential {improvement_str}"
-                else:
-                    if simplify_table is False:
-                        note = note + " No Change"
+        noise_waterline = 1.0
+
+        # Positive change is treated as a (potential) regression and negative
+        # change as a (potential) improvement in this comparison mode.
+        if percentage_change > 0.0:
+            if percentage_change > waterline:
+                detected_regression = True
+                note = note + f" {regression_str}"
+            elif percentage_change > noise_waterline:
+                if simplify_table is False:
+                    note = note + f" potential {regression_str}"
+            else:
+                if simplify_table is False:
+                    note = note + " No Change"
+
+        if percentage_change < 0.0:
+            if -percentage_change > waterline:
+                detected_improvement = True
+                note = note + f" {improvement_str}"
+            elif -percentage_change > noise_waterline:
+                if simplify_table is False:
+                    note = note + f" potential {improvement_str}"
+            else:
+                if simplify_table is False:
+                    note = note + " No Change"
 
         result["detected_regression"] = detected_regression
         result["detected_improvement"] = detected_improvement
@@ -1542,18 +2203,201 @@
             comparison_v_str,
             note,
             percentage_change,
-            test_link,
+            env_name,  # use env_name instead of test_link for environment comparison
         )
         result["line"] = line
     else:
         logging.warning(
-            "There were no datapoints both for baseline and comparison for test: {test_name}"
+            f"No datapoints for either baseline or comparison for env: {env_name}, test: {test_name}"
         )
         result["no_datapoints_both"] = True
 
     return result
 
 
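+# Sketch of the intended call pattern for the helper below (hypothetical test
+# name; the env list mirrors the one hardcoded in compare_command_logic):
+#
+#     envs = ["oss-standalone", "oss-standalone-02-io-threads"]
+#     (detected, table_full, *rest) = from_rts_to_env_comparison_table(
+#         "memtier_benchmark-1Mkeys-load-string", envs,
+#         baseline_str, comparison_str, by_str_baseline, by_str_comparison,
+#         from_ts_ms, to_ts_ms, last_n_baseline, last_n_comparison,
+#         metric_mode, metric_name, ...
+#     )
+#
+# Each returned table row describes one environment for that single test.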
+def from_rts_to_env_comparison_table(
+    test_name,
+    env_list,
+    baseline_str,
+    comparison_str,
+    by_str_baseline,
+    by_str_comparison,
+    from_ts_ms,
+    to_ts_ms,
+    last_n_baseline,
+    last_n_comparison,
+    metric_mode,
+    metric_name,
+    print_improvements_only,
+    print_regressions_only,
+    skip_unstable,
+    regressions_percent_lower_limit,
+    rts,
+    simplify_table,
+    test_filter,
+    tf_triggering_env_baseline,
+    tf_triggering_env_comparison,
+    verbose,
+    running_platform_baseline=None,
+    running_platform_comparison=None,
+    baseline_github_repo="redis",
+    comparison_github_repo="redis",
+    baseline_github_org="redis",
+    comparison_github_org="redis",
+    regression_str="REGRESSION",
+    improvement_str="IMPROVEMENT",
+    tests_with_config=None,
+    extra_filters="",
+):
+    """
+    Compare environments for a single test instead of tests for a single environment.
+    Returns comparison data organized by environment.
+    """
+    if tests_with_config is None:
+        # avoid the mutable-default-argument pitfall
+        tests_with_config = {}
+    original_metric_mode = metric_mode
+
+    # Initialize result containers
+    table_full = []
+    table_stable = []
+    table_unstable = []
+    table_improvements = []
+    table_regressions = []
+    regressions_list = []
+    improvements_list = []
+    unstable_list = []
+    baseline_only_list = []
+    comparison_only_list = []
+    no_datapoints_list = []
+    boxplot_data = []
+
+    total_improvements = 0
+    total_regressions = 0
+    total_stable = 0
+    total_unstable = 0
+    total_comparison_points = 0
+    detected_regressions = False
+
+    # Progress tracking
+    from tqdm import tqdm
+
+    progress = tqdm(total=len(env_list), desc=f"Processing envs for {test_name[:30]}")
+
+    def process_env_wrapper(env_name):
+        """Process a single environment and return (env_name, result)."""
+        result = process_single_env_comparison(
+            env_name,
+            test_name,
+            tests_with_config,
+            original_metric_mode,
+            baseline_str,
+            comparison_str,
+            by_str_baseline,
+            by_str_comparison,
+            metric_name,
+            test_filter,
+            baseline_github_repo,
+            comparison_github_repo,
+            tf_triggering_env_baseline,
+            tf_triggering_env_comparison,
+            extra_filters,
+            baseline_github_org,
+            comparison_github_org,
+            running_platform_baseline,
+            running_platform_comparison,
+            rts,
+            from_ts_ms,
+            to_ts_ms,
+            last_n_baseline,
+            last_n_comparison,
+            verbose,
+            regressions_percent_lower_limit,
+            simplify_table,
+            regression_str,
+            improvement_str,
+            progress,
+        )
+        return env_name, result
+
+    # Process all environments
+    for env_name in env_list:
+        env_name, result = process_env_wrapper(env_name)
+
+        if result["no_datapoints_both"]:
+            no_datapoints_list.append(env_name)
+            continue
+
+        if result["baseline_only"]:
+            baseline_only_list.append(env_name)
+            continue
+
+        if result["comparison_only"]:
+            comparison_only_list.append(env_name)
+            continue
+
+        if result["unstable"]:
+            unstable_list.append((env_name, result["line"]))
+            table_unstable.append(result["line"])
+            total_unstable += 1
+            if skip_unstable is False:
+                table_full.append(result["line"])
+                total_comparison_points += 1
+            continue
+
+        if result["detected_regression"]:
+            detected_regressions = True
+            regressions_list.append((env_name, result["percentage_change"]))
+            table_regressions.append(result["line"])
+            total_regressions += 1
+
+        if result["detected_improvement"]:
+            improvements_list.append((env_name, result["percentage_change"]))
+            table_improvements.append(result["line"])
+            total_improvements += 1
+
+        if (
+            not result["detected_regression"]
+            and not result["detected_improvement"]
+            and not result["unstable"]
+        ):
+            total_stable += 1
+            table_stable.append(result["line"])
+
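+        # --print-improvements-only / --print-regressions-only act as opt-in
+        # filters: when a flag is set, an env line is kept only if the
+        # corresponding change was detected for that environment; with neither
+        # flag set, every line is kept.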
result["detected_regression"]): + table_full.append(result["line"]) + total_comparison_points += 1 + + # Collect boxplot data + if result["boxplot_data"]: + boxplot_data.append(result["boxplot_data"]) + + progress.close() + + return ( + detected_regressions, + table_full, + table_stable, + table_unstable, + table_improvements, + table_regressions, + total_improvements, + total_regressions, + total_stable, + total_unstable, + total_comparison_points, + regressions_list, + improvements_list, + unstable_list, + baseline_only_list, + comparison_only_list, + no_datapoints_list, + False, # group_change (not applicable for env comparison) + False, # command_change (not applicable for env comparison) + boxplot_data, + ) + + def from_rts_to_regression_table( baseline_deployment_name, comparison_deployment_name, @@ -1795,6 +2639,8 @@ def get_only_Totals(baseline_timeseries): ) new_base = [] for ts_name in baseline_timeseries: + if "8.5.0" in ts_name: + continue if "io-threads" in ts_name: continue if "oss-cluster" in ts_name: