#!/usr/bin/env python3

import argparse
import json
from collections import defaultdict
import os
import subprocess
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from datetime import datetime, timezone
from telegram.alert_queued_jobs import send_telegram_message, get_current_workflow_url, get_alert_logins


def timestamp_to_time(ts):
    return datetime.fromtimestamp(ts, timezone.utc).strftime('%Y-%m-%d %H:%M:%S')
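# Example (assuming ts is a Unix timestamp in seconds):
#   timestamp_to_time(1700000000) -> '2023-11-14 22:13:20'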


def get_total_runner_memory_in_gb():
    cmd = [
        'awk', '/MemTotal/ { printf "%.3f \\n", $2/1024/1024 }', '/proc/meminfo'
    ]
    return float(subprocess.run(cmd, text=True, capture_output=True, timeout=60).stdout)
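# MemTotal in /proc/meminfo is reported in kB, so awk divides by 1024^2 to get
# GB; e.g. a runner with 64 GiB of RAM yields roughly 62.8 (illustrative value,
# since the kernel reserves part of physical memory).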


def calculate_total_memory_consumption(processes):
    """
    Computes the total memory consumption at each point in time.
    Works correctly with unsorted data.

    Args:
        processes: list of tuples (rss_consumption, path, start_time, end_time)

    Returns:
        timeline: sorted list of timestamps
        memory_usage: list of total memory consumption at each timestamp
    """
    processes = sorted(processes, key=lambda x: x[2])
    events = defaultdict(float)

    for rss, path, start, end in processes:
        events[start] += rss  # Add memory when a process starts
        events[end] -= rss    # Release memory when it ends

    sorted_events = sorted(events.items(), key=lambda x: x[0])
    timeline = []
    memory_usage = []
    current_memory = 0

    for timestamp, delta in sorted_events:
        current_memory += delta  # Apply the change
        timeline.append(timestamp)
        memory_usage.append(round(current_memory, 2))
    return timeline, memory_usage
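# Worked example (hypothetical data): two overlapping test processes
#   calculate_total_memory_consumption([(2.0, 'suite_a', 0, 10), (1.5, 'suite_b', 5, 15)])
# returns timeline [0, 5, 10, 15] and memory_usage [2.0, 3.5, 1.5, 0.0]:
# total RSS rises at each start event and falls at each end event.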


def get_active_processes_at_time(processes, target_time):
    """
    Returns the list of processes active at the given moment in time
    """
    active = []
    for rss, path, start, end in processes:
        if start <= target_time < end:
            active.append((rss, path, start, round(end)))
    return active
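# With the hypothetical data above, target_time=7 returns both tuples,
# since 0 <= 7 < 10 and 5 <= 7 < 15.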


def create_simple_interactive_plot(processes, output_file):
    """Simplified version with hover information"""
    timeline, memory_usage = calculate_total_memory_consumption(processes)

    # Create a subplot with room for additional information
    fig = make_subplots(
        rows=1, cols=1,
        row_heights=[1,],
        subplot_titles=('Memory Consumption',),
        vertical_spacing=0.12
    )

    # Build hover text with information about the active processes
    hover_texts = []
    process_counts = []
    timeline_in_time = list(map(timestamp_to_time, timeline))
    for t, mem in zip(timeline, memory_usage):
        active = get_active_processes_at_time(processes, t)
        process_counts.append(len(active))
        test_suites = defaultdict(float)
        for rss, path, _, _ in active:
            test_suites[path.split(' ')[0]] += rss
        test_suites = sorted(test_suites.items(),
                             key=lambda x: x[1], reverse=True)

        hover_text = f"<b>Time:</b> {timestamp_to_time(t)}<br>"
        hover_text += f"<b>Memory:</b> {mem} GB<br>"
        hover_text += f"<b>Processes:</b> {len(active)}<br><br>"

        if active:
            hover_text += "<b>Top 5 Test Suites:</b><br>"
            for suite, rss in test_suites[:5]:
                hover_text += f" • {suite}: {round(rss, 2)} GB<br>"

        hover_texts.append(hover_text)

    # Memory consumption trace
    fig.add_trace(
        go.Scatter(
            x=timeline_in_time,
            y=memory_usage,
            mode='lines',
            name='Total RSS',
            line=dict(shape='hv', width=1, color='rgb(46, 134, 171)'),
            fill='tozeroy',
            fillcolor='rgba(46, 134, 171, 0.3)',
            hovertext=hover_texts,
            hoverinfo='text'
        ),
        row=1, col=1
    )
    fig.add_trace(
        go.Scatter(
            x=timeline_in_time,
            y=process_counts,
            mode='lines',
            name='Active processes',
            line=dict(shape='hv', width=1, color='rgb(171, 134, 46)'),
        ),
        row=1, col=1
    )

    # Mark the peak
    max_memory = max(memory_usage)
    max_idx = memory_usage.index(max_memory)
    max_time = timeline[max_idx]

    if not output_file:
        print(hover_texts[max_idx].replace('<br>', '\n'))

    fig.add_trace(
        go.Scatter(
            x=[timestamp_to_time(max_time)],
            y=[max_memory],
            mode='markers+text',
            marker=dict(size=15, color='red', symbol='star'),
            text=[f'Peak: {max_memory} GB'],
            textposition='top center',
            name='Peak',
            showlegend=False
        ),
        row=1, col=1
    )

    fig.update_yaxes(title_text="Memory (GB)", row=1, col=1)

    fig.update_layout(
        height=800,
        hovermode='x unified',
        template='plotly_white',
        title_text="Interactive Memory Consumption Monitor"
    )
    if output_file:
        fig.write_html(output_file)
    else:
        fig.show()
    return max_memory
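# Design note: the 'hv' (horizontal-then-vertical) line shape draws a step
# function, which matches the event data: total RSS only changes at process
# start/end timestamps, so interpolating between points would be misleading.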


def parse_report_file(report_json):
    results = []
    for result in report_json["results"]:
        type_ = result["type"]
        if type_ == "test" and result.get("chunk"):
            rss_consumption = result["metrics"].get(
                "suite_max_proc_tree_memory_consumption_kb", 0) / 1024 / 1024
            start_time = result["metrics"].get('suite_start_timestamp', 0)
            end_time = start_time + result["metrics"].get("wall_time", 0)
            path = result["path"] + " " + result.get("subtest_name", "")
            results.append((rss_consumption, path, start_time, end_time))
    return results
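# A minimal sketch of the report entry shape this parser assumes (field names
# taken from the code above; the concrete values are illustrative):
# {
#     "type": "test",
#     "chunk": true,
#     "path": "some/test/path",
#     "subtest_name": "chunk0",
#     "metrics": {
#         "suite_max_proc_tree_memory_consumption_kb": 1048576,  # -> 1.0 GB
#         "suite_start_timestamp": 1700000000,
#         "wall_time": 120.5
#     }
# }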


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--report-file",
        help="path to file received via 'ya make ... --build-results-report <file>'",
        type=argparse.FileType("r"),
        required=True,
    )
    parser.add_argument(
        "--output-file",
        help="path to graph file"
    )
    parser.add_argument(
        "--output-file-url",
        help="Path to graph file in run artifacts"
    )
    parser.add_argument('--dry-run', action='store_true',
                        help='Debug mode without sending to Telegram')
    parser.add_argument('--bot-token',
                        help='Telegram bot token (or use TELEGRAM_BOT_TOKEN env var)')
    parser.add_argument('--chat-id',
                        help='Telegram chat ID')
    parser.add_argument('--channel',
                        help='Telegram channel ID (alternative to --chat-id)')
    parser.add_argument('--thread-id', type=int,
                        help='Telegram thread ID for group messages')
    parser.add_argument('--memory-threshold', type=float,
                        help='Threshold for used memory in percent. Default = 90',
                        default=90)
    args = parser.parse_args()

    report_file = args.report_file
    obj = json.load(report_file)
    results = parse_report_file(obj)
    output_file = args.output_file

    # Draw or export the figure with RAM usage
    max_used_ram = create_simple_interactive_plot(results, output_file)

    max_agent_ram = get_total_runner_memory_in_gb()
    max_agent_ram_with_threshold = max_agent_ram * (args.memory_threshold / 100)
    # Alert only when peak usage crosses the configured share of agent RAM
    if max_used_ram > max_agent_ram_with_threshold:
        print(f"Max used RAM {max_used_ram} GB is greater than "
              f"{args.memory_threshold}% of agent RAM ({max_agent_ram} GB total)")

        bot_token = args.bot_token or os.getenv('TELEGRAM_BOT_TOKEN')
        chat_id = args.channel or args.chat_id or os.getenv('TELEGRAM_CHAT_ID')
        thread_id = args.thread_id or os.getenv('TELEGRAM_THREAD_ID')
        dry_run = args.dry_run or os.getenv('DRY_RUN', 'false').lower() == 'true'

        if not bot_token or not chat_id:
            print('No bot-token or chat-id was set. Forcing dry-run mode')
            dry_run = True

        message = f"""🚨 *Possible OOM*
During [RUN]({get_current_workflow_url()}) max used RAM *{round(max_used_ram, 1)}GB* is greater than the allowed *{round(max_agent_ram_with_threshold, 1)}GB*
{max_agent_ram}GB total
Threshold is {args.memory_threshold}%

[RAM usage graph]({args.output_file_url})
CC {get_alert_logins()}"""
        if dry_run:
            print(message)
        else:
            if chat_id and not chat_id.startswith('-') and len(chat_id) >= 10:
                # Add -100 prefix for supergroup
                chat_id = f"-100{chat_id}"
            send_telegram_message(
                bot_token,
                chat_id,
                message,
                thread_id,
                "MarkdownV2")
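# Example invocation (the script name, file names, and chat id are illustrative):
#   ./memory_graph.py --report-file report.json --output-file ram.html \
#       --output-file-url https://example.com/artifacts/ram.html \
#       --memory-threshold 90 --dry-run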