Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -48,41 +48,41 @@ func NewAmazonEfaCollector(logger *slog.Logger) (Collector, error) {

// Detailed description for all metrics.
descriptions := map[string]string{
"alloc_pd_err": "Number of allocations PD errors",
"alloc_ucontext_err": "Number of allocations UContext errors",
"cmds_err": "Number of commands errors",
"completed_cmds": "Number of completed commands",
"create_ah_err": "Number of create AH errors",
"create_cq_err": "Number of create CQ errors",
"create_qp_err": "Number of create qp errors",
"impaired_remote_conn_events": "Number of EFA SRD connections entered an impaired state, resulting in a reduced throughput rate limit.",
"keep_alive_rcvd": "Number of keep-alive packets received",
"lifespan": "Lifespan of the port",
"mmap_err": "Number of mmap errors",
"no_completion_cmds": "Number of commands with no completion",
"rdma_read_bytes": "Number of bytes read with RDMA",
"rdma_read_resp_bytes": "Number of read reponses bytes with RDMA",
"rdma_read_wr_err": "Number of read write errors with RDMA",
"rdma_read_wrs": "Number of read rs with RDMA",
"rdma_write_bytes": "Number of bytes wrote with RDMA",
"rdma_write_recv_bytes": "Number of bytes wrote and received with RDMA",
"rdma_write_wr_err": "Number of bytes wrote wr with error RDMA",
"rdma_write_wrs": "Number of bytes wrote wrs RDMA",
"recv_bytes": "Number of bytes recv bytes",
"recv_wrs": "Number of bytes recv wrs",
"reg_mr_err": "Number of reg_mr errors",
"retrans_bytes": "Number of efa_srd bytes retransmitted",
"retrans_pkts": "Number of efa_srd packets retransmitted",
"retrans_timeout_events": "Number of times SRD traffic reached timeout and required network path change",
"rx_bytes": "Number of bytes received",
"rx_drops": "Number of packets droped",
"rx_pkts": "Number of packets received",
"send_bytes": "Number of bytes send",
"send_wrs": "Number of wrs send",
"submitted_cmds": "Number of submitted commands",
"tx_bytes": "Number of bytes transmitted",
"tx_pkts": "Number of packets transmitted",
"unresponsive_remote_events": "Number of times SRD connection remote was unresponsive",
"alloc_pd_err": "Number of allocations PD errors",
"alloc_ucontext_err": "Number of allocations UContext errors",
"cmds_err": "Number of commands errors",
"completed_cmds": "Number of completed commands",
"create_ah_err": "Number of create AH errors",
"create_cq_err": "Number of create CQ errors",
"create_qp_err": "Number of create qp errors",
"impaired_remote_conn_events": "Number of EFA SRD connections entered an impaired state, resulting in a reduced throughput rate limit.",
"keep_alive_rcvd": "Number of keep-alive packets received",
"lifespan": "Lifespan of the port",
"mmap_err": "Number of mmap errors",
"no_completion_cmds": "Number of commands with no completion",
"rdma_read_bytes": "Number of bytes read with RDMA",
"rdma_read_resp_bytes": "Number of read responses bytes with RDMA",
"rdma_read_wr_err": "Number of read write errors with RDMA",
"rdma_read_wrs": "Number of read rs with RDMA",
"rdma_write_bytes": "Number of bytes wrote with RDMA",
"rdma_write_recv_bytes": "Number of bytes wrote and received with RDMA",
"rdma_write_wr_err": "Number of bytes wrote wr with error RDMA",
"rdma_write_wrs": "Number of bytes wrote wrs RDMA",
"recv_bytes": "Number of bytes recv bytes",
"recv_wrs": "Number of bytes recv wrs",
"reg_mr_err": "Number of reg_mr errors",
"retrans_bytes": "Number of efa_srd bytes retransmitted",
"retrans_pkts": "Number of efa_srd packets retransmitted",
"retrans_timeout_events": "Number of times SRD traffic reached timeout and required network path change",
"rx_bytes": "Number of bytes received",
"rx_drops": "Number of packets dropped",
"rx_pkts": "Number of packets received",
"send_bytes": "Number of bytes send",
"send_wrs": "Number of wrs send",
"submitted_cmds": "Number of submitted commands",
"tx_bytes": "Number of bytes transmitted",
"tx_pkts": "Number of packets transmitted",
"unresponsive_remote_events": "Number of times SRD connection remote was unresponsive",
}

i.metricDescs = make(map[string]*prometheus.Desc)
Expand Down
138 changes: 81 additions & 57 deletions 4.validation_and_observability/3.efa-node-exporter/class_amazon_efa.go
Original file line number Diff line number Diff line change
Expand Up @@ -257,64 +257,88 @@ func parseAmazonEfaCounters(portPath string) (*AmazonEfaCounters, error) {
//vp := util.NewValueParser(value)

switch f.Name() {
case "impaired_remote_conn_events":
counters.ImpairedRemoteConnEvents, err = parseUInt64(value)
case "lifespan":
counters.Lifespan, err = parseUInt64(value)
case "rdma_read_bytes":
counters.RdmaReadBytes, err = parseUInt64(value)
case "rdma_read_resp_bytes":
counters.RdmaReadRespBytes, err = parseUInt64(value)
case "rdma_read_wr_err":
counters.RdmaReadWrErr, err = parseUInt64(value)
case "rdma_read_wrs":
counters.RdmaReadWrs, err = parseUInt64(value)
case "rdma_write_bytes":
counters.RdmaWriteBytes, err = parseUInt64(value)
case "rdma_write_recv_bytes":
counters.RdmaWriteRecvBytes, err = parseUInt64(value)
case "rdma_write_wr_err":
counters.RdmaWriteWrErr, err = parseUInt64(value)
case "rdma_write_wrs":
counters.RdmaWriteWrs, err = parseUInt64(value)
case "recv_bytes":
counters.RecvBytes, err = parseUInt64(value)
case "recv_wrs":
counters.RecvWrs, err = parseUInt64(value)
case "retrans_bytes":
counters.RetransBytes, err = parseUInt64(value)
case "retrans_pkts":
counters.RetransPkts, err = parseUInt64(value)
case "retrans_timeout_events":
counters.RetransTimeoutEvents, err = parseUInt64(value)
case "rx_bytes":
counters.RxBytes, err = parseUInt64(value)
case "rx_drops":
counters.RxDrops, err = parseUInt64(value)
case "rx_pkts":
counters.RxPkts, err = parseUInt64(value)
case "send_bytes":
counters.SendBytes, err = parseUInt64(value)
case "send_wrs":
counters.SendWrs, err = parseUInt64(value)
case "tx_bytes":
counters.TxBytes, err = parseUInt64(value)
case "tx_pkts":
counters.TxPkts, err = parseUInt64(value)
case "unresponsive_remote_events":
counters.UnresponsiveRemoteEvents, err = parseUInt64(value)

if err != nil {
// Ugly workaround for handling https://github.com/prometheus/node_exporter/issues/966
// when counters are `N/A (not available)`.
// This was already patched and submitted, see
// https://www.spinics.net/lists/linux-rdma/msg68596.html
// Remove this as soon as the fix lands in the enterprise distros.
if strings.Contains(value, "N/A (no PMA)") {
continue
}
return nil, err
case "alloc_pd_err":
counters.AllocPdErr, err = parseUInt64(value)
case "alloc_ucontext_err":
counters.AllocUcontextErr, err = parseUInt64(value)
case "cmds_err":
counters.CmdsErr, err = parseUInt64(value)
case "completed_cmds":
counters.CompletedCmds, err = parseUInt64(value)
case "create_ah_err":
counters.CreateAhErr, err = parseUInt64(value)
case "create_cq_err":
counters.CreateCqErr, err = parseUInt64(value)
case "create_qp_err":
counters.CreateQpErr, err = parseUInt64(value)
case "impaired_remote_conn_events":
counters.ImpairedRemoteConnEvents, err = parseUInt64(value)
case "keep_alive_rcvd":
counters.KeepAliveRcvd, err = parseUInt64(value)
case "lifespan":
counters.Lifespan, err = parseUInt64(value)
case "mmap_err":
counters.MmapErr, err = parseUInt64(value)
case "no_completion_cmds":
counters.NoCompletionCmds, err = parseUInt64(value)
case "rdma_read_bytes":
counters.RdmaReadBytes, err = parseUInt64(value)
case "rdma_read_resp_bytes":
counters.RdmaReadRespBytes, err = parseUInt64(value)
case "rdma_read_wr_err":
counters.RdmaReadWrErr, err = parseUInt64(value)
case "rdma_read_wrs":
counters.RdmaReadWrs, err = parseUInt64(value)
case "rdma_write_bytes":
counters.RdmaWriteBytes, err = parseUInt64(value)
case "rdma_write_recv_bytes":
counters.RdmaWriteRecvBytes, err = parseUInt64(value)
case "rdma_write_wr_err":
counters.RdmaWriteWrErr, err = parseUInt64(value)
case "rdma_write_wrs":
counters.RdmaWriteWrs, err = parseUInt64(value)
case "recv_bytes":
counters.RecvBytes, err = parseUInt64(value)
case "recv_wrs":
counters.RecvWrs, err = parseUInt64(value)
case "reg_mr_err":
counters.RegMrErr, err = parseUInt64(value)
case "retrans_bytes":
counters.RetransBytes, err = parseUInt64(value)
case "retrans_pkts":
counters.RetransPkts, err = parseUInt64(value)
case "retrans_timeout_events":
counters.RetransTimeoutEvents, err = parseUInt64(value)
case "rx_bytes":
counters.RxBytes, err = parseUInt64(value)
case "rx_drops":
counters.RxDrops, err = parseUInt64(value)
case "rx_pkts":
counters.RxPkts, err = parseUInt64(value)
case "send_bytes":
counters.SendBytes, err = parseUInt64(value)
case "send_wrs":
counters.SendWrs, err = parseUInt64(value)
case "submitted_cmds":
counters.SubmittedCmds, err = parseUInt64(value)
case "tx_bytes":
counters.TxBytes, err = parseUInt64(value)
case "tx_pkts":
counters.TxPkts, err = parseUInt64(value)
case "unresponsive_remote_events":
counters.UnresponsiveRemoteEvents, err = parseUInt64(value)
}

if err != nil {
// Ugly workaround for handling https://github.com/prometheus/node_exporter/issues/966
// when counters are `N/A (not available)`.
// This was already patched and submitted, see
// https://www.spinics.net/lists/linux-rdma/msg68596.html
// Remove this as soon as the fix lands in the enterprise distros.
if strings.Contains(value, "N/A (no PMA)") {
continue
}
return nil, fmt.Errorf("failed to parse counter %s with value %q: %w", f.Name(), value, err)
}
}

Expand Down