Skip to content

Commit 16febbd

Browse files
authored
feat: add CPU, memory and node status info to cluster_info (#6897)
* feat: add CPU and memory info to `cluster_info` Signed-off-by: WenyXu <[email protected]> * feat: add `node_status` to `cluster_info` table Signed-off-by: WenyXu <[email protected]> * test: update sqlness Signed-off-by: WenyXu <[email protected]> * chore: apply suggestions Signed-off-by: WenyXu <[email protected]> * chore: update proto Signed-off-by: WenyXu <[email protected]> --------- Signed-off-by: WenyXu <[email protected]>
1 parent 47384c7 commit 16febbd

File tree

24 files changed

+284
-69
lines changed

24 files changed

+284
-69
lines changed

Cargo.lock

Lines changed: 3 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,7 @@ etcd-client = { git = "https://github.com/GreptimeTeam/etcd-client", rev = "f62d
145145
fst = "0.4.7"
146146
futures = "0.3"
147147
futures-util = "0.3"
148-
greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "66eb089afa6baaa3ddfafabd0a4abbe317d012c3" }
148+
greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "f9836cf8aab30e672f640c6ef4c1cfd2cf9fbc36" }
149149
hex = "0.4"
150150
http = "1"
151151
humantime = "2.1"

src/catalog/Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ common-runtime.workspace = true
3232
common-telemetry.workspace = true
3333
common-time.workspace = true
3434
common-version.workspace = true
35+
common-workload.workspace = true
3536
dashmap.workspace = true
3637
datafusion.workspace = true
3738
datatypes.workspace = true
@@ -48,6 +49,7 @@ prometheus.workspace = true
4849
promql-parser.workspace = true
4950
rand.workspace = true
5051
rustc-hash.workspace = true
52+
serde.workspace = true
5153
serde_json.workspace = true
5254
session.workspace = true
5355
snafu.workspace = true

src/catalog/src/system_schema/information_schema/cluster_info.rs

Lines changed: 85 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,11 @@ use std::time::Duration;
1818
use arrow_schema::SchemaRef as ArrowSchemaRef;
1919
use common_catalog::consts::INFORMATION_SCHEMA_CLUSTER_INFO_TABLE_ID;
2020
use common_error::ext::BoxedError;
21-
use common_meta::cluster::NodeInfo;
21+
use common_meta::cluster::{DatanodeStatus, NodeInfo, NodeStatus};
2222
use common_recordbatch::adapter::RecordBatchStreamAdapter;
2323
use common_recordbatch::{RecordBatch, SendableRecordBatchStream};
2424
use common_time::timestamp::Timestamp;
25+
use common_workload::DatanodeWorkloadType;
2526
use datafusion::execution::TaskContext;
2627
use datafusion::physical_plan::SendableRecordBatchStream as DfSendableRecordBatchStream;
2728
use datafusion::physical_plan::stream::RecordBatchStreamAdapter as DfRecordBatchStreamAdapter;
@@ -32,7 +33,9 @@ use datatypes::timestamp::TimestampMillisecond;
3233
use datatypes::value::Value;
3334
use datatypes::vectors::{
3435
Int64VectorBuilder, StringVectorBuilder, TimestampMillisecondVectorBuilder,
36+
UInt32VectorBuilder, UInt64VectorBuilder,
3537
};
38+
use serde::Serialize;
3639
use snafu::ResultExt;
3740
use store_api::storage::{ScanRequest, TableId};
3841

@@ -41,14 +44,20 @@ use crate::error::{CreateRecordBatchSnafu, InternalSnafu, Result};
4144
use crate::system_schema::information_schema::{CLUSTER_INFO, InformationTable, Predicates};
4245
use crate::system_schema::utils;
4346

47+
const PEER_TYPE_FRONTEND: &str = "FRONTEND";
48+
const PEER_TYPE_METASRV: &str = "METASRV";
49+
4450
const PEER_ID: &str = "peer_id";
4551
const PEER_TYPE: &str = "peer_type";
4652
const PEER_ADDR: &str = "peer_addr";
53+
const CPUS: &str = "cpus";
54+
const MEMORY_BYTES: &str = "memory_bytes";
4755
const VERSION: &str = "version";
4856
const GIT_COMMIT: &str = "git_commit";
4957
const START_TIME: &str = "start_time";
5058
const UPTIME: &str = "uptime";
5159
const ACTIVE_TIME: &str = "active_time";
60+
const NODE_STATUS: &str = "node_status";
5261

5362
const INIT_CAPACITY: usize = 42;
5463

@@ -57,11 +66,14 @@ const INIT_CAPACITY: usize = 42;
5766
/// - `peer_id`: the peer server id.
5867
/// - `peer_type`: the peer type, such as `datanode`, `frontend`, `metasrv` etc.
5968
/// - `peer_addr`: the peer gRPC address.
69+
/// - `cpus`: the number of CPUs of the peer.
70+
/// - `memory_bytes`: the memory bytes of the peer.
6071
/// - `version`: the build package version of the peer.
6172
/// - `git_commit`: the build git commit hash of the peer.
6273
/// - `start_time`: the starting time of the peer.
6374
/// - `uptime`: the uptime of the peer.
6475
/// - `active_time`: the time since the last activity of the peer.
76+
/// - `node_status`: the status info of the peer.
6577
///
6678
#[derive(Debug)]
6779
pub(super) struct InformationSchemaClusterInfo {
@@ -82,6 +94,8 @@ impl InformationSchemaClusterInfo {
8294
ColumnSchema::new(PEER_ID, ConcreteDataType::int64_datatype(), false),
8395
ColumnSchema::new(PEER_TYPE, ConcreteDataType::string_datatype(), false),
8496
ColumnSchema::new(PEER_ADDR, ConcreteDataType::string_datatype(), true),
97+
ColumnSchema::new(CPUS, ConcreteDataType::uint32_datatype(), false),
98+
ColumnSchema::new(MEMORY_BYTES, ConcreteDataType::uint64_datatype(), false),
8599
ColumnSchema::new(VERSION, ConcreteDataType::string_datatype(), false),
86100
ColumnSchema::new(GIT_COMMIT, ConcreteDataType::string_datatype(), false),
87101
ColumnSchema::new(
@@ -91,6 +105,7 @@ impl InformationSchemaClusterInfo {
91105
),
92106
ColumnSchema::new(UPTIME, ConcreteDataType::string_datatype(), true),
93107
ColumnSchema::new(ACTIVE_TIME, ConcreteDataType::string_datatype(), true),
108+
ColumnSchema::new(NODE_STATUS, ConcreteDataType::string_datatype(), true),
94109
]))
95110
}
96111

@@ -140,11 +155,14 @@ struct InformationSchemaClusterInfoBuilder {
140155
peer_ids: Int64VectorBuilder,
141156
peer_types: StringVectorBuilder,
142157
peer_addrs: StringVectorBuilder,
158+
cpus: UInt32VectorBuilder,
159+
memory_bytes: UInt64VectorBuilder,
143160
versions: StringVectorBuilder,
144161
git_commits: StringVectorBuilder,
145162
start_times: TimestampMillisecondVectorBuilder,
146163
uptimes: StringVectorBuilder,
147164
active_times: StringVectorBuilder,
165+
node_status: StringVectorBuilder,
148166
}
149167

150168
impl InformationSchemaClusterInfoBuilder {
@@ -155,11 +173,14 @@ impl InformationSchemaClusterInfoBuilder {
155173
peer_ids: Int64VectorBuilder::with_capacity(INIT_CAPACITY),
156174
peer_types: StringVectorBuilder::with_capacity(INIT_CAPACITY),
157175
peer_addrs: StringVectorBuilder::with_capacity(INIT_CAPACITY),
176+
cpus: UInt32VectorBuilder::with_capacity(INIT_CAPACITY),
177+
memory_bytes: UInt64VectorBuilder::with_capacity(INIT_CAPACITY),
158178
versions: StringVectorBuilder::with_capacity(INIT_CAPACITY),
159179
git_commits: StringVectorBuilder::with_capacity(INIT_CAPACITY),
160180
start_times: TimestampMillisecondVectorBuilder::with_capacity(INIT_CAPACITY),
161181
uptimes: StringVectorBuilder::with_capacity(INIT_CAPACITY),
162182
active_times: StringVectorBuilder::with_capacity(INIT_CAPACITY),
183+
node_status: StringVectorBuilder::with_capacity(INIT_CAPACITY),
163184
}
164185
}
165186

@@ -176,9 +197,10 @@ impl InformationSchemaClusterInfoBuilder {
176197

177198
fn add_node_info(&mut self, predicates: &Predicates, node_info: NodeInfo) {
178199
let peer_type = node_info.status.role_name();
200+
let peer_id = peer_id(peer_type, node_info.peer.id);
179201

180202
let row = [
181-
(PEER_ID, &Value::from(node_info.peer.id)),
203+
(PEER_ID, &Value::from(peer_id)),
182204
(PEER_TYPE, &Value::from(peer_type)),
183205
(PEER_ADDR, &Value::from(node_info.peer.addr.as_str())),
184206
(VERSION, &Value::from(node_info.version.as_str())),
@@ -189,13 +211,7 @@ impl InformationSchemaClusterInfoBuilder {
189211
return;
190212
}
191213

192-
if peer_type == "FRONTEND" || peer_type == "METASRV" {
193-
// Always set peer_id to be -1 for frontends and metasrvs
194-
self.peer_ids.push(Some(-1));
195-
} else {
196-
self.peer_ids.push(Some(node_info.peer.id as i64));
197-
}
198-
214+
self.peer_ids.push(Some(peer_id));
199215
self.peer_types.push(Some(peer_type));
200216
self.peer_addrs.push(Some(&node_info.peer.addr));
201217
self.versions.push(Some(&node_info.version));
@@ -212,6 +228,8 @@ impl InformationSchemaClusterInfoBuilder {
212228
self.start_times.push(None);
213229
self.uptimes.push(None);
214230
}
231+
self.cpus.push(Some(node_info.cpus));
232+
self.memory_bytes.push(Some(node_info.memory_bytes));
215233

216234
if node_info.last_activity_ts > 0 {
217235
self.active_times.push(Some(
@@ -220,6 +238,8 @@ impl InformationSchemaClusterInfoBuilder {
220238
} else {
221239
self.active_times.push(None);
222240
}
241+
self.node_status
242+
.push(format_node_status(&node_info).as_deref());
223243
}
224244

225245
fn format_duration_since(ts: u64) -> String {
@@ -233,11 +253,14 @@ impl InformationSchemaClusterInfoBuilder {
233253
Arc::new(self.peer_ids.finish()),
234254
Arc::new(self.peer_types.finish()),
235255
Arc::new(self.peer_addrs.finish()),
256+
Arc::new(self.cpus.finish()),
257+
Arc::new(self.memory_bytes.finish()),
236258
Arc::new(self.versions.finish()),
237259
Arc::new(self.git_commits.finish()),
238260
Arc::new(self.start_times.finish()),
239261
Arc::new(self.uptimes.finish()),
240262
Arc::new(self.active_times.finish()),
263+
Arc::new(self.node_status.finish()),
241264
];
242265
RecordBatch::new(self.schema.clone(), columns).context(CreateRecordBatchSnafu)
243266
}
@@ -263,3 +286,56 @@ impl DfPartitionStream for InformationSchemaClusterInfo {
263286
))
264287
}
265288
}
289+
290+
fn peer_id(peer_type: &str, peer_id: u64) -> i64 {
291+
if peer_type == PEER_TYPE_FRONTEND || peer_type == PEER_TYPE_METASRV {
292+
-1
293+
} else {
294+
peer_id as i64
295+
}
296+
}
297+
298+
#[derive(Serialize)]
299+
struct DisplayMetasrvStatus {
300+
is_leader: bool,
301+
}
302+
303+
#[derive(Serialize)]
304+
struct DisplayDatanodeStatus {
305+
workloads: Vec<DatanodeWorkloadType>,
306+
leader_regions: usize,
307+
follower_regions: usize,
308+
}
309+
310+
impl From<&DatanodeStatus> for DisplayDatanodeStatus {
311+
fn from(status: &DatanodeStatus) -> Self {
312+
Self {
313+
workloads: status
314+
.workloads
315+
.types
316+
.iter()
317+
.flat_map(|w| DatanodeWorkloadType::from_i32(*w))
318+
.collect(),
319+
leader_regions: status.leader_regions,
320+
follower_regions: status.follower_regions,
321+
}
322+
}
323+
}
324+
325+
fn format_node_status(node_info: &NodeInfo) -> Option<String> {
326+
match &node_info.status {
327+
NodeStatus::Datanode(datanode_status) => {
328+
serde_json::to_string(&DisplayDatanodeStatus::from(datanode_status)).ok()
329+
}
330+
NodeStatus::Frontend(_) => None,
331+
NodeStatus::Flownode(_) => None,
332+
NodeStatus::Metasrv(metasrv_status) => {
333+
if metasrv_status.is_leader {
334+
serde_json::to_string(&DisplayMetasrvStatus { is_leader: true }).ok()
335+
} else {
336+
None
337+
}
338+
}
339+
NodeStatus::Standalone => None,
340+
}
341+
}

src/cmd/src/standalone.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -787,6 +787,10 @@ impl InformationExtension for StandaloneInformationExtension {
787787
// Use `self.start_time_ms` instead.
788788
// It's not precise but enough.
789789
start_time_ms: self.start_time_ms,
790+
cpus: common_config::utils::get_cpus() as u32,
791+
memory_bytes: common_config::utils::get_sys_total_memory()
792+
.unwrap_or_default()
793+
.as_bytes(),
790794
};
791795
Ok(vec![node_info])
792796
}

src/common/config/src/utils.rs

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,24 @@ pub fn get_sys_total_memory() -> Option<ReadableSize> {
3939
}
4040
}
4141

42+
/// `ResourceSpec` holds the static resource specifications of a node,
43+
/// such as CPU cores and memory capacity. These values are fixed
44+
/// at startup and do not change dynamically during runtime.
45+
#[derive(Debug, Clone, Copy)]
46+
pub struct ResourceSpec {
47+
pub cpus: usize,
48+
pub memory: Option<ReadableSize>,
49+
}
50+
51+
impl Default for ResourceSpec {
52+
fn default() -> Self {
53+
Self {
54+
cpus: get_cpus(),
55+
memory: get_sys_total_memory(),
56+
}
57+
}
58+
}
59+
4260
#[cfg(test)]
4361
mod tests {
4462
use super::*;

src/common/meta/src/cluster.rs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,12 @@ pub struct NodeInfo {
118118
pub git_commit: String,
119119
// The node star timestamp
120120
pub start_time_ms: u64,
121+
// The node build cpus
122+
#[serde(default)]
123+
pub cpus: u32,
124+
// The node build memory bytes
125+
#[serde(default)]
126+
pub memory_bytes: u64,
121127
}
122128

123129
#[derive(Debug, Clone, Copy, Eq, Hash, PartialEq, Serialize, Deserialize)]
@@ -324,6 +330,8 @@ mod tests {
324330
version: "".to_string(),
325331
git_commit: "".to_string(),
326332
start_time_ms: 1,
333+
cpus: 0,
334+
memory_bytes: 0,
327335
};
328336

329337
let node_info_bytes: Vec<u8> = node_info.try_into().unwrap();

src/datanode/src/heartbeat.rs

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ use std::time::Duration;
2020
use api::v1::meta::heartbeat_request::NodeWorkloads;
2121
use api::v1::meta::{DatanodeWorkloads, HeartbeatRequest, NodeInfo, Peer, RegionRole, RegionStat};
2222
use common_base::Plugins;
23+
use common_config::utils::ResourceSpec;
2324
use common_meta::cache_invalidator::CacheInvalidatorRef;
2425
use common_meta::datanode::REGION_STATISTIC_KEY;
2526
use common_meta::distributed_time_constants::META_KEEP_ALIVE_INTERVAL_SECS;
@@ -62,6 +63,7 @@ pub struct HeartbeatTask {
6263
interval: u64,
6364
resp_handler_executor: HeartbeatResponseHandlerExecutorRef,
6465
region_alive_keeper: Arc<RegionAliveKeeper>,
66+
resource_spec: ResourceSpec,
6567
}
6668

6769
impl Drop for HeartbeatTask {
@@ -104,6 +106,7 @@ impl HeartbeatTask {
104106
interval: opts.heartbeat.interval.as_millis() as u64,
105107
resp_handler_executor,
106108
region_alive_keeper,
109+
resource_spec: Default::default(),
107110
})
108111
}
109112

@@ -231,6 +234,8 @@ impl HeartbeatTask {
231234

232235
self.region_alive_keeper.start(Some(event_receiver)).await?;
233236
let mut last_sent = Instant::now();
237+
let cpus = self.resource_spec.cpus as u32;
238+
let memory_bytes = self.resource_spec.memory.unwrap_or_default().as_bytes();
234239

235240
common_runtime::spawn_hb(async move {
236241
let sleep = tokio::time::sleep(Duration::from_millis(0));
@@ -244,7 +249,8 @@ impl HeartbeatTask {
244249
version: build_info.version.to_string(),
245250
git_commit: build_info.commit_short.to_string(),
246251
start_time_ms: node_epoch,
247-
cpus: num_cpus::get() as u32,
252+
cpus,
253+
memory_bytes,
248254
}),
249255
node_workloads: Some(NodeWorkloads::Datanode(DatanodeWorkloads {
250256
types: workload_types.iter().map(|w| w.to_i32()).collect(),

0 commit comments

Comments
 (0)