Skip to content

Commit c03e592

Browse files
authored
Merge pull request #8559 from shirady/nsfs-nc-disable-Prometheus-server-start
NC | NSFS | Wrap with `try-catch` Prometheus Reporting `start_server`
2 parents e9808eb + 09c4c45 commit c03e592

File tree

4 files changed

+46
-15
lines changed

4 files changed

+46
-15
lines changed

src/endpoint/endpoint.js

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,17 @@ async function main(options = {}) {
118118
// the primary just forks and returns, workers will continue to serve
119119
fork_count = options.forks ?? config.ENDPOINT_FORKS;
120120
const metrics_port = options.metrics_port || config.EP_METRICS_SERVER_PORT;
121-
if (fork_utils.start_workers(metrics_port, fork_count)) return;
121+
/**
122+
* Please notice that we can run the main in 2 states:
123+
* 1. Only the primary process runs the main (fork is 0 or undefined) - everything that
124+
* is implemented here would be run by this process.
125+
* 2. A primary process with multiple forks (IMPORTANT) - if there is implementation that
126+
* in only relevant to the primary process it should be implemented in
127+
* fork_utils.start_workers because the primary process returns after start_workers
128+
* and the forks will continue executing the code lines in this function
129+
* */
130+
const is_workers_started_from_primary = await fork_utils.start_workers(metrics_port, fork_count);
131+
if (is_workers_started_from_primary) return;
122132

123133
const endpoint_group_id = process.env.ENDPOINT_GROUP_ID || 'default-endpoint-group';
124134

src/manage_nsfs/diagnose.js

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,9 +59,17 @@ async function gather_metrics() {
5959
const buffer = await buffer_utils.read_stream_join(res);
6060
const body = buffer.toString('utf8');
6161
metrics_output = JSON.parse(body);
62+
if (!metrics_output) throw new Error('received empty metrics response', { cause: res.statusCode });
63+
write_stdout_response(ManageCLIResponse.MetricsStatus, metrics_output);
64+
} else if (res.statusCode >= 500 && res.rawHeaders.includes('application/json')) {
65+
const buffer = await buffer_utils.read_stream_join(res);
66+
const body = buffer.toString('utf8');
67+
const error_output = JSON.parse(body);
68+
if (!error_output) throw new Error('received empty metrics response', { cause: res.statusCode });
69+
throw_cli_error({ ...ManageCLIError.MetricsStatusFailed, ...error_output });
70+
} else {
71+
throw new Error('received empty metrics response', { cause: res.statusCode });
6272
}
63-
if (!metrics_output) throw new Error('recieved empty metrics response', { cause: res.statusCode });
64-
write_stdout_response(ManageCLIResponse.MetricsStatus, metrics_output);
6573
} catch (err) {
6674
dbg.warn('could not receive metrics response', err);
6775
throw_cli_error({ ...ManageCLIError.MetricsStatusFailed, cause: err?.errors?.[0] || err });

src/server/analytic_services/prometheus_reporting.js

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -61,12 +61,7 @@ async function start_server(
6161
const server = http.createServer(async (req, res) => {
6262
// Serve all metrics on the root path for system that do have one or more fork running.
6363
if (fork_enabled) {
64-
const metrics = await aggregatorRegistry.clusterMetrics();
65-
if (req.url === '' || req.url === '/') {
66-
res.writeHead(200, { 'Content-Type': aggregatorRegistry.contentType });
67-
res.end(metrics);
68-
return;
69-
}
64+
// we would like this part to be first as clusterMetrics might fail.
7065
if (req.url === '/metrics/nsfs_stats') {
7166
res.writeHead(200, { 'Content-Type': 'text/plain' });
7267
const nsfs_report = {
@@ -77,6 +72,24 @@ async function start_server(
7772
res.end(JSON.stringify(nsfs_report));
7873
return;
7974
}
75+
let metrics;
76+
try {
77+
metrics = await aggregatorRegistry.clusterMetrics();
78+
} catch (err) {
79+
dbg.error('start_server: Could not get the metrics, got an error', err);
80+
res.writeHead(504, { 'Content-Type': 'application/json' });
81+
const reply = JSON.stringify({
82+
error: 'Internal server error - timeout',
83+
message: 'Looks like the server is taking a long time to respond (Could not get the metrics)',
84+
});
85+
res.end(reply);
86+
return;
87+
}
88+
if (req.url === '' || req.url === '/') {
89+
res.writeHead(200, { 'Content-Type': aggregatorRegistry.contentType });
90+
res.end(metrics);
91+
return;
92+
}
8093
// Serve report's metrics on the report name path
8194
const report_name = req.url.substr(1);
8295
const single_metrics = export_single_metrics(metrics, report_name);
@@ -165,7 +178,7 @@ async function metrics_nsfs_stats_handler() {
165178
op_stats_counters: op_stats_counters,
166179
fs_worker_stats_counters: fs_worker_stats_counters
167180
};
168-
dbg.log1(`_create_nsfs_report: nsfs_report ${nsfs_report}`);
181+
dbg.log1('_create_nsfs_report: nsfs_report', nsfs_report);
169182
return JSON.stringify(nsfs_report);
170183
}
171184

src/util/fork_utils.js

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -30,9 +30,9 @@ const fs_workers_stats = {};
3030
*
3131
* @param {number?} count number of workers to start.
3232
* @param {number?} metrics_port prometheus metris port.
33-
* @returns {boolean} true if workers were started.
33+
* @returns {Promise<boolean>} true if workers were started.
3434
*/
35-
function start_workers(metrics_port, count = 0) {
35+
async function start_workers(metrics_port, count = 0) {
3636
const exit_events = [];
3737
if (cluster.isPrimary && count > 0) {
3838
for (let i = 0; i < count; ++i) {
@@ -68,12 +68,12 @@ function start_workers(metrics_port, count = 0) {
6868
});
6969
for (const id in cluster.workers) {
7070
if (id) {
71-
cluster.workers[id].on('message', nsfs_io_state_handler);
71+
cluster.workers[id].on('message', nsfs_io_stats_handler);
7272
}
7373
}
7474
if (metrics_port > 0) {
7575
dbg.log0('Starting metrics server', metrics_port);
76-
prom_reporting.start_server(metrics_port, true);
76+
await prom_reporting.start_server(metrics_port, true);
7777
dbg.log0('Started metrics server successfully');
7878
}
7979
return true;
@@ -82,7 +82,7 @@ function start_workers(metrics_port, count = 0) {
8282
return false;
8383
}
8484

85-
function nsfs_io_state_handler(msg) {
85+
function nsfs_io_stats_handler(msg) {
8686
if (msg.io_stats) {
8787
for (const [key, value] of Object.entries(msg.io_stats)) {
8888
io_stats[key] += value;

0 commit comments

Comments
 (0)