
Commit f16696b

Collect operator autopilot health metrics
1 parent 409cb07 commit f16696b

File tree

2 files changed: 62 additions & 7 deletions

README.md

Lines changed: 11 additions & 0 deletions
@@ -19,6 +19,7 @@ make
 | ----------------------------------- | ---------------------------------------------------------------------------------------------------- | --------------------------------------------- |
 | consul_up | Was the last query of Consul successful | |
 | consul_raft_peers | How many peers (servers) are in the Raft cluster | |
+| consul_operator_autopilot_server_health | The status of each server's health from a server cluster perspective | server_id, server_name, server_address, server_health_metric |
 | consul_serf_lan_members | How many members are in the cluster | |
 | consul_serf_lan_member_status | Status of member in the cluster. 1=Alive, 2=Leaving, 3=Left, 4=Failed. | member |
 | consul_catalog_services | How many services are in the cluster | |
@@ -75,6 +76,16 @@ against the actual value found via monitoring.
 A prefix must be supplied to activate this feature. Pass `/` if you want to
 search the entire keyspace.
 
+#### Operator Autopilot Server Health
+
+This exporter can gather low-level server metrics through the
+Operator API's Autopilot Health endpoint. This is a highly privileged
+endpoint that requires `operator:read`, so it should only be used
+with a restricted ACL in a trusted fashion.
+
+* __`operator.autopilot-server-health`:__ Collects low-level server metrics
+  from the v1/operator/autopilot/health endpoint.
+
 ### Environment variables
 
 The consul\_exporter supports all environment variables provided by the official
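
For illustration, with this flag enabled the exporter exposes one series per server and per health field. The label names and metric name below follow the commit; the server IDs, addresses, and values are hypothetical:

```text
# HELP consul_operator_autopilot_server_health The status of each server's health from a server cluster perspective.
# TYPE consul_operator_autopilot_server_health counter
consul_operator_autopilot_server_health{server_id="3ddbea43",server_name="consul-1",server_address="10.0.0.1:8300",server_health_metric="Health"} 1
consul_operator_autopilot_server_health{server_id="3ddbea43",server_name="consul-1",server_address="10.0.0.1:8300",server_health_metric="Voter"} 1
consul_operator_autopilot_server_health{server_id="3ddbea43",server_name="consul-1",server_address="10.0.0.1:8300",server_health_metric="LastIndex"} 1024
consul_operator_autopilot_server_health{server_id="3ddbea43",server_name="consul-1",server_address="10.0.0.1:8300",server_health_metric="LastTerm"} 3
```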

consul_exporter.go

Lines changed: 51 additions & 7 deletions
@@ -58,6 +58,11 @@ var (
 		"Does Raft cluster have a leader (according to this node).",
 		nil, nil,
 	)
+	operatorAutopilotServerHealth = prometheus.NewDesc(
+		prometheus.BuildFQName(namespace, "", "operator_autopilot_server_health"),
+		"The status of each server's health from a server cluster perspective.",
+		[]string{"server_id", "server_name", "server_address", "server_health_metric"}, nil,
+	)
 	nodeCount = prometheus.NewDesc(
 		prometheus.BuildFQName(namespace, "", "serf_lan_members"),
 		"How many members are in the cluster.",
@@ -121,6 +126,7 @@ type Exporter struct {
 	kvPrefix         string
 	kvFilter         *regexp.Regexp
 	healthSummary    bool
+	operatorHealth   bool
 	logger           log.Logger
 	requestLimitChan chan struct{}
 }
@@ -137,7 +143,7 @@ type consulOpts struct {
 }
 
 // NewExporter returns an initialized Exporter.
-func NewExporter(opts consulOpts, kvPrefix, kvFilter string, healthSummary bool, logger log.Logger) (*Exporter, error) {
+func NewExporter(opts consulOpts, kvPrefix, kvFilter string, healthSummary bool, operatorHealth bool, logger log.Logger) (*Exporter, error) {
 	uri := opts.uri
 	if !strings.Contains(uri, "://") {
 		uri = "http://" + uri
@@ -188,6 +194,7 @@ func NewExporter(opts consulOpts, kvPrefix, kvFilter string, healthSummary bool,
 		kvPrefix:         kvPrefix,
 		kvFilter:         regexp.MustCompile(kvFilter),
 		healthSummary:    healthSummary,
+		operatorHealth:   operatorHealth,
 		logger:           logger,
 		requestLimitChan: requestLimitChan,
 	}, nil
@@ -199,6 +206,7 @@ func (e *Exporter) Describe(ch chan<- *prometheus.Desc) {
 	ch <- up
 	ch <- clusterServers
 	ch <- clusterLeader
+	ch <- operatorAutopilotServerHealth
 	ch <- nodeCount
 	ch <- memberStatus
 	ch <- serviceCount
@@ -215,6 +223,7 @@ func (e *Exporter) Describe(ch chan<- *prometheus.Desc) {
 func (e *Exporter) Collect(ch chan<- prometheus.Metric) {
 	ok := e.collectPeersMetric(ch)
 	ok = e.collectLeaderMetric(ch) && ok
+	ok = e.collectOperatorAutopilotServerHealthMetric(ch) && ok
 	ok = e.collectNodesMetric(ch) && ok
 	ok = e.collectMembersMetric(ch) && ok
 	ok = e.collectServicesMetric(ch) && ok
@@ -262,6 +271,40 @@ func (e *Exporter) collectLeaderMetric(ch chan<- prometheus.Metric) bool {
 	return true
 }
 
+func (e *Exporter) collectOperatorAutopilotServerHealthMetric(ch chan<- prometheus.Metric) bool {
+	if !e.operatorHealth {
+		return true
+	}
+	clusterHealth, err := e.client.Operator().AutopilotServerHealth(&queryOptions)
+	if err != nil {
+		level.Error(e.logger).Log("msg", "Failed to get autopilot server health", "err", err)
+		return false
+	}
+	for _, server := range clusterHealth.Servers {
+		ch <- prometheus.MustNewConstMetric(
+			operatorAutopilotServerHealth, prometheus.CounterValue, float64(server.LastIndex), server.ID, server.Name, server.Address, "LastIndex",
+		)
+		ch <- prometheus.MustNewConstMetric(
+			operatorAutopilotServerHealth, prometheus.CounterValue, float64(server.LastTerm), server.ID, server.Name, server.Address, "LastTerm",
+		)
+		serverHealth := 0.0
+		if server.Healthy {
+			serverHealth = 1.0
+		}
+		ch <- prometheus.MustNewConstMetric(
+			operatorAutopilotServerHealth, prometheus.CounterValue, serverHealth, server.ID, server.Name, server.Address, "Health",
+		)
+		serverVoter := 0.0
+		if server.Voter {
+			serverVoter = 1.0
+		}
+		ch <- prometheus.MustNewConstMetric(
+			operatorAutopilotServerHealth, prometheus.CounterValue, serverVoter, server.ID, server.Name, server.Address, "Voter",
+		)
+	}
+	return true
+}
+
 func (e *Exporter) collectNodesMetric(ch chan<- prometheus.Metric) bool {
 	nodes, _, err := e.client.Catalog().Nodes(&queryOptions)
 	if err != nil {
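
For context, the collector above leans on the Consul API client's Operator endpoint. Here is a minimal standalone sketch of the same call, assuming the `github.com/hashicorp/consul/api` package; the address and ACL token come from the environment, and the printed fields mirror those exported above:

```go
package main

import (
	"fmt"
	"log"

	"github.com/hashicorp/consul/api"
)

func main() {
	// DefaultConfig honors CONSUL_HTTP_ADDR and CONSUL_HTTP_TOKEN;
	// the token must carry operator:read for this endpoint.
	client, err := api.NewClient(api.DefaultConfig())
	if err != nil {
		log.Fatal(err)
	}
	// Same endpoint the collector queries: v1/operator/autopilot/health.
	reply, err := client.Operator().AutopilotServerHealth(nil)
	if err != nil {
		log.Fatal(err)
	}
	for _, s := range reply.Servers {
		fmt.Printf("%s (%s): healthy=%t voter=%t last_index=%d last_term=%d\n",
			s.Name, s.Address, s.Healthy, s.Voter, s.LastIndex, s.LastTerm)
	}
}
```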
@@ -456,11 +499,12 @@ func init() {
 
 func main() {
 	var (
-		listenAddress = kingpin.Flag("web.listen-address", "Address to listen on for web interface and telemetry.").Default(":9107").String()
-		metricsPath   = kingpin.Flag("web.telemetry-path", "Path under which to expose metrics.").Default("/metrics").String()
-		healthSummary = kingpin.Flag("consul.health-summary", "Generate a health summary for each service instance. Needs n+1 queries to collect all information.").Default("true").Bool()
-		kvPrefix      = kingpin.Flag("kv.prefix", "Prefix from which to expose key/value pairs.").Default("").String()
-		kvFilter      = kingpin.Flag("kv.filter", "Regex that determines which keys to expose.").Default(".*").String()
+		listenAddress  = kingpin.Flag("web.listen-address", "Address to listen on for web interface and telemetry.").Default(":9107").String()
+		metricsPath    = kingpin.Flag("web.telemetry-path", "Path under which to expose metrics.").Default("/metrics").String()
+		healthSummary  = kingpin.Flag("consul.health-summary", "Generate a health summary for each service instance. Needs n+1 queries to collect all information.").Default("true").Bool()
+		kvPrefix       = kingpin.Flag("kv.prefix", "Prefix from which to expose key/value pairs.").Default("").String()
+		kvFilter       = kingpin.Flag("kv.filter", "Regex that determines which keys to expose.").Default(".*").String()
+		operatorHealth = kingpin.Flag("operator.autopilot-server-health", "Collect all operator autopilot server health.").Default("true").Bool()
 
 		opts = consulOpts{}
 	)
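
Since the new option is a kingpin boolean flag that defaults to true, collection should be switchable off with kingpin's standard negated form; a usage sketch, not part of the commit:

```text
./consul_exporter --no-operator.autopilot-server-health
```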
@@ -486,7 +530,7 @@ func main() {
 	level.Info(logger).Log("msg", "Starting consul_exporter", "version", version.Info())
 	level.Info(logger).Log("build_context", version.BuildContext())
 
-	exporter, err := NewExporter(opts, *kvPrefix, *kvFilter, *healthSummary, logger)
+	exporter, err := NewExporter(opts, *kvPrefix, *kvFilter, *healthSummary, *operatorHealth, logger)
 	if err != nil {
 		level.Error(logger).Log("msg", "Error creating the exporter", "err", err)
 		os.Exit(1)
