Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
prometheus/alertmanager/conf/prod.alertmanager.yml
htpasswd
7 changes: 7 additions & 0 deletions prometheus/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ Services:
* node-exporter (host metrics collector)
* cadvisor (containers metrics collector)
* dockerd-exporter (Docker daemon metrics collector, requires Docker experimental metrics-addr to be enabled)
* blackbox-exporter (probe arbitrary http/https/dns/icmp/ssh endpoints to monitor status)
* alertmanager (alerts dispatcher) `http://<swarm-ip>:9093`

### custom multiarch node-exporter
Expand Down Expand Up @@ -96,3 +97,9 @@ Assuming the filter is `$node_id` the container count query should look like thi
```
count(rate(container_last_seen{container_label_com_docker_swarm_node_id=~"$node_id"}[5m]))
```

## Additional Alerting and PromQL resources
* A decent list of alerts for a variety of Prometheus exporters is available:
https://awesome-prometheus-alerts.grep.to/rules.html
* PromQL documentation: https://prometheus.io/docs/prometheus/latest/querying/basics/

7 changes: 7 additions & 0 deletions prometheus/blackbox-exporter/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# blackbox-exporter
Trialing usage of blackbox-exporter within my monitoring setup. I am using `prom/blackbox-exporter` to expose a probe endpoint that Prometheus can 'scrape'. For the trial, I will be probing Cloudflare's DoH (DNS over HTTPS) status endpoint to monitor that DNS requests are indeed routed via CF DoH through my pihole setup.

## Usage
* URL to check current DoH status: https://bbd96f23-eda8-465d-b190-6ddf056cae66.is-doh.cloudflareresolve.com/resolvertest
* URL for Prometheus to scrape: http://doh-status-probe:9115/probe?target=https://bbd96f23-eda8-465d-b190-6ddf056cae66.is-doh.cloudflareresolve.com/resolvertest&module=http_2xx_expect_1 (hostname matches the `doh-status-probe` service in docker-compose.yml)

38 changes: 38 additions & 0 deletions prometheus/blackbox-exporter/blackbox.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Example probe URL (the module parameter must match a module defined below):
# http://localhost:9115/probe?target=https://bbd96f23-eda8-465d-b190-6ddf056cae66.is-doh.cloudflareresolve.com/resolvertest&module=http_2xx_expect_1
modules:
http_2xx_expect_1:
prober: http
timeout: 5s
http:
valid_http_versions: ["HTTP/1.1", "HTTP/2"]
valid_status_codes: [] # Defaults to 2xx
method: GET
# headers:
# Host: vhost.example.com
# Accept-Language: en-US
# Origin: example.com
# no_follow_redirects: false
# fail_if_ssl: false
# fail_if_not_ssl: false
# fail_if_body_matches_regexp:
# - "Could not connect to database"
fail_if_body_not_matches_regexp:
- "^1$"
preferred_ip_protocol: "ip4" # defaults to "ip6"
ip_protocol_fallback: false # no fallback to "ip6"
# fail_if_header_matches: # Verifies that no cookies are set
# - header: Set-Cookie
# allow_missing: true
# regexp: '.*'
# fail_if_header_not_matches:
# - header: Access-Control-Allow-Origin
# regexp: '(\*|example\.com)'
# tls_config:
# insecure_skip_verify: false
# preferred_ip_protocol: "ip4" # defaults to "ip6"
# ip_protocol_fallback: false # no fallback to "ip6"



# curl 'https://bbd96f23-eda8-465d-b190-6ddf056cae66.is-doh.cloudflareresolve.com/resolvertest' -H 'Accept: */*' -H 'Referer: https://www.cloudflare.com/ssl/encrypted-sni/' -H 'Origin: https://www.cloudflare.com' -H 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36' -H 'DNT: 1' -H 'Sec-Fetch-Mode: cors' --compressed
99 changes: 92 additions & 7 deletions prometheus/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@ networks:
net:
driver: overlay
attachable: true
# traefik-public:
# driver: overlay
# external: true

volumes:
prometheus: {}
Expand All @@ -29,8 +32,72 @@ configs:
file: ./alertmanager/conf/prod.alertmanager.yml
prometheus:
file: ./prometheus/conf/prometheus.yml
blackbox_exporter:
file: ./blackbox-exporter/blackbox.yml
traefik_htpasswd:
file: ./traefik/htpasswd



services:
traefik:
image: traefik:v1.7.19-alpine
ports:
- target: 80
published: 8080
# mode: host
- target: 443
published: 4433
# mode: host
- target: 8080
published: 8181
# the Traefik dashboard (container port 8080) is published at http://<swarm-ip>:8181
command: >
--api
--docker
--docker.swarmMode
--docker.domain=${DOMAIN:-docker.localhost}
--docker.watch
--defaultentrypoints=http
--entrypoints='Name:http Address::80'
--entrypoints='Name:https Address::443 TLS'
--logLevel=INFO
--accessLog
--metrics
--metrics.prometheus
volumes:
- /var/run/docker.sock:/var/run/docker.sock
configs:
- source: traefik_htpasswd
target: /etc/htpasswd
networks:
# - traefik-public
- net
deploy:
mode: replicated
replicas: 1
placement:
constraints:
- node.role == manager
update_config:
parallelism: 1
delay: 10s
doh-status-probe:
image: prom/blackbox-exporter
networks:
- net
command:
- '--config.file=/config/blackbox.yml'
configs:
- source: blackbox_exporter
target: /config/blackbox.yml
deploy:
mode: replicated
replicas: 1
resources:
limits:
memory: 32M

dockerd-exporter:
image: jmb12686/socat
networks:
Expand Down Expand Up @@ -92,14 +159,18 @@ services:
target: /etc/grafana/dashboards/swarmprom-prometheus-dash.json
- source: grafana_dashboards_services
target: /etc/grafana/dashboards/swarmprom-services-dash.json
ports:
- 3000:3000
deploy:
mode: replicated
replicas: 1
placement:
constraints:
- node.role == manager
labels:
- "traefik.backend=grafana"
- "traefik.port=3000"
- "traefik.docker.network=mon_net"
- "traefik.entrypoints=http"
- "traefik.frontend.rule=Host:grafana.home.local"
resources:
limits:
memory: 128M
Expand All @@ -121,8 +192,6 @@ services:
- "--web.external-url=http://raspi-swarm.home.local:9093"
- '--config.file=/etc/alertmanager/alertmanager.yml'
- '--storage.path=/alertmanager'
ports:
- 9093:9093
volumes:
- alertmanager:/alertmanager
deploy:
Expand All @@ -131,6 +200,12 @@ services:
placement:
constraints:
- node.role == manager
labels:
- "traefik.backend=alertmanager"
- "traefik.port=9093"
- "traefik.docker.network=mon_net"
- "traefik.entrypoints=http"
- "traefik.frontend.rule=Host:alertmanager.home.local"
resources:
limits:
memory: 128M
Expand Down Expand Up @@ -169,8 +244,10 @@ services:
- net
command:
- '--config.file=/etc/prometheus/prometheus.yml'
# - '--web.route-prefix=/prometheus/'
# - '--web.external-url=http://raspi-swarm.home.local/prometheus'
- '--storage.tsdb.path=/prometheus'
- '--storage.tsdb.retention=${PROMETHEUS_RETENTION:-24h}'
- '--storage.tsdb.retention=${PROMETHEUS_RETENTION:-72h}'
volumes:
- prometheus:/prometheus
configs:
Expand All @@ -180,14 +257,22 @@ services:
target: /etc/prometheus/swarm_node.rules.yml
- source: task_rules
target: /etc/prometheus/swarm_task.rules.yml
ports:
- 9090:9090
- source: traefik_htpasswd
target: /etc/htpasswd
deploy:
mode: replicated
replicas: 1
placement:
constraints:
- node.role == manager
labels:
- "traefik.backend=prometheus"
- "traefik.port=9090" # Prometheus port to register with Traefik
- "traefik.docker.network=mon_net"
# - "traefik.backend=net"
- "traefik.entrypoints=http"
- "traefik.frontend.rule=Host:prometheus.home.local"
# - "traefik.frontend.auth.basic.usersFile=/etc/htpasswd"
resources:
limits:
memory: 2048M
Expand Down
17 changes: 17 additions & 0 deletions prometheus/prometheus/conf/prometheus.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,4 +41,21 @@ scrape_configs:
type: 'A'
port: 9100

- job_name: 'doh-status-probe'
metrics_path: /probe
params:
module: [http_2xx_expect_1] # Look for a HTTP 200 response and expect '1' in body.
static_configs:
- targets:
# - http://prometheus.io # Target to probe with http.
- https://bbd96f23-eda8-465d-b190-6ddf056cae66.is-doh.cloudflareresolve.com/resolvertest # Target to probe with https.
# - http://example.com:8080 # Target to probe with http on port 8080.
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: doh-status-probe:9115 # The blackbox exporter's real hostname:port.


6 changes: 3 additions & 3 deletions prometheus/prometheus/rules/swarm_node.rules.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,13 @@ groups:
##count(count(container_tasks_state{container_label_com_docker_swarm_node_id =~".+"}) by (container_label_com_docker_swarm_node_id)) < 3
##count(node_meta * on(instance) group_left(node_name) node_meta{node_id=~".+"}) < 3
- alert: piholeMissing
expr: absent(((time() - container_last_seen{name=~"pihole_pihole.*"}) < 5))
expr: (count(time() - container_last_seen{name=~".*pihole_pihole.*"} < 30) OR vector(0)) < 2
for: 5s
labels:
severity: error
annotations:
description: pihole service containers are missing and may be down. Check status of home DNS setup!
summary: pihole containers are missing
description: Less than 2 pihole containers exist! Check status of home DNS setup!
summary: pihole container missing
- alert: less_than_3_nodes
expr: count(node_meta * on(instance) group_left(node_name) node_meta{node_id=~".+"}) < 3
for: 5m
Expand Down
18 changes: 17 additions & 1 deletion prometheus/prometheus/rules/swarm_task.rules.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,22 @@
groups:
- name: /1/store/projects/vagrant/docker-swarm-vagrant/apps/swarmprom/prometheus/rules/swarm_task.rules.yml
- name: swarm_task.rules.yml
rules:
- alert: ProbeFailed
expr: probe_success == 0
for: 3m
labels:
severity: error
annotations:
summary: "Probe failed {{ $labels.job }} (instance {{ $labels.instance }})"
description: "Probe failed\n VALUE = {{ $value }}\n JOB: {{ $labels.job }}\n LABELS: {{ $labels }}"
- alert: ExporterDown
expr: up == 0
for: 5m
labels:
severity: warning
annotations:
summary: "Exporter down {{ $labels.job }} (instance {{ $labels.instance }})"
description: "Prometheus exporter down\n VALUE = {{ $value }}\n JOB: {{ $labels.job }}\n LABELS: {{ $labels }}"
- alert: task_high_cpu_usage_50
expr: sum(rate(container_cpu_usage_seconds_total{container_label_com_docker_swarm_task_name=~".+"}[1m]))
BY (container_label_com_docker_swarm_task_name, container_label_com_docker_swarm_node_id)
Expand Down