diff --git a/misc/grafana/.gitignore b/misc/grafana/.gitignore new file mode 100644 index 000000000..48b8bf907 --- /dev/null +++ b/misc/grafana/.gitignore @@ -0,0 +1 @@ +vendor/ diff --git a/misc/grafana/README.md b/misc/grafana/README.md new file mode 100644 index 000000000..36b16a074 --- /dev/null +++ b/misc/grafana/README.md @@ -0,0 +1,34 @@ +# Grafana dashboard + +This directory contains a Grafana dashboard for monitoring matrix-authentication-service. + +It is defined using [jsonnet] and the [grafonnet] library. + +## Usage + +The built dashboard is available at `dashboard.json`. +Import it into Grafana to start using it. + +## Development and customization + +Requirements: + +- [go-jsonnet] +- [jsonnet-bundler] + +First install the dependencies using [jsonnet-bundler]: + +```sh +jb install +``` + +Regenerate the dashboard using [go-jsonnet]: + +```sh +jsonnet -J vendor -o dashboard.json dashboard.libsonnet +``` + +[jsonnet]: https://jsonnet.org/ +[go-jsonnet]: https://github.com/google/go-jsonnet +[grafonnet]: https://github.com/grafana/grafonnet +[jsonnet-bundler]: https://github.com/jsonnet-bundler/jsonnet-bundler diff --git a/misc/grafana/dashboard.json b/misc/grafana/dashboard.json new file mode 100644 index 000000000..c5e4da6ca --- /dev/null +++ b/misc/grafana/dashboard.json @@ -0,0 +1,1190 @@ +{ + "description": "A dashboard for monitoring matrix-authentication-service", + "graphTooltip": 1, + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "panels": [ ], + "title": "Deployment", + "type": "row" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 100, + "lineWidth": 0, + "stacking": { + "mode": "percent" + } + } + } + }, + "gridPos": { + "h": 6, + "w": 24, + "x": 0, + "y": 1 + }, + "id": 2, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + 
"pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum by (service_version) (target_info{job=~\"$job\", instance=~\"$instance\"})", + "legendFormat": "{{service_version}}" + } + ], + "title": "Service Version", + "type": "timeseries" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 100 + }, + "id": 3, + "panels": [ + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "gridPos": { + "h": 6, + "w": 8, + "x": 0, + "y": 0 + }, + "id": 4, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(tokio_runtime_workers{job=~\"$job\", instance=~\"$instance\"})" + } + ], + "title": "Workers", + "type": "stat" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "gridPos": { + "h": 6, + "w": 8, + "x": 8, + "y": 0 + }, + "id": 5, + "options": { + "calculate": false, + "cellGap": 0, + "yAxis": { + "unit": "ns" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum by (le) (irate(tokio_runtime_worker_poll_time_bucket{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]))", + "format": "heatmap" + } + ], + "title": "Tick Time", + "type": "heatmap" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10 + }, + "min": 0, + "unit": "ns" + } + }, + "gridPos": { + "h": 6, + "w": 8, + "x": 16, + "y": 0 + }, + "id": 6, + "options": { + "legend": { + "calcs": [ + "mean" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "histogram_quantile(0.99, sum by (le) (irate(tokio_runtime_worker_poll_time_bucket{job=~\"$job\", 
instance=~\"$instance\"}[$__rate_interval])))", + "legendFormat": "P99", + "refId": "P99" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "histogram_quantile(0.95, sum by (le) (irate(tokio_runtime_worker_poll_time_bucket{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])))", + "legendFormat": "P95", + "refId": "P95" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "histogram_quantile(0.90, sum by (le) (irate(tokio_runtime_worker_poll_time_bucket{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])))", + "legendFormat": "P90", + "refId": "P90" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "histogram_quantile(0.50, sum by (le) (irate(tokio_runtime_worker_poll_time_bucket{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])))", + "legendFormat": "P50", + "refId": "P50" + } + ], + "title": "Tick Time Percentile", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "min": 0, + "unit": "ops" + } + }, + "gridPos": { + "h": 6, + "w": 8, + "x": 0, + "y": 0 + }, + "id": 7, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(irate(tokio_runtime_worker_polls_total{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]))", + "legendFormat": "Polls" + } + ], + "title": "Poll Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "min": 0, + "unit": "tasks" + } + }, + "gridPos": { + "h": 6, + "w": 8, + "x": 8, + "y": 0 + }, + "id": 8, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(tokio_runtime_alive_tasks{job=~\"$job\", instance=~\"$instance\"})", + "legendFormat": "Active Tasks" + } 
+ ], + "title": "Active Tasks", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "min": 0, + "unit": "ms" + } + }, + "gridPos": { + "h": 6, + "w": 8, + "x": 16, + "y": 0 + }, + "id": 9, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(irate(tokio_runtime_worker_busy_duration_milliseconds_total{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]))", + "legendFormat": "Worker busy Time" + } + ], + "title": "Busy Time", + "type": "timeseries" + } + ], + "title": "Tokio", + "type": "row" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 200 + }, + "id": 10, + "panels": [ ], + "title": "HTTP server", + "type": "row" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "min": 0, + "unit": "reqps" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 201 + }, + "id": 11, + "options": { + "legend": { + "calcs": [ + "mean" + ], + "displayMode": "table", + "placement": "right" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum by (http_request_method, http_route, http_response_status_code )\n(irate(http_server_duration_count{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]))\n", + "legendFormat": "{{http_request_method}} {{http_route}} {{http_response_status_code}}" + } + ], + "title": "Requests", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "min": 0, + "unit": "reqps" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 201 + }, + "id": 12, + "options": { + "legend": { + "calcs": [ + "mean" + ], + "displayMode": "table", + "placement": "right" + } + }, + "pluginVersion": "v11.4.0", + 
"targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum by (http_request_method, http_route, http_response_status_code)\n(irate(http_server_duration_count{job=~\"$job\", instance=~\"$instance\", http_response_status_code=~\"4..|5..\"}[$__rate_interval]))\n", + "legendFormat": "{{http_request_method}} {{http_route}} {{http_response_status_code}}" + } + ], + "title": "Errors", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 209 + }, + "id": 13, + "options": { + "calculate": false, + "cellGap": 0, + "yAxis": { + "unit": "ms" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum by (le) (\n irate(http_server_duration_bucket{job=~\"$job\", instance=~\"$instance\", http_route!=\"/oauth2/introspect\"}[$__rate_interval])\n)\n", + "format": "heatmap" + } + ], + "title": "Request latency (excluding introspection)", + "type": "heatmap" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10 + }, + "min": 0, + "unit": "ms" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 209 + }, + "id": 14, + "options": { + "legend": { + "calcs": [ + "mean" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "histogram_quantile(0.99, sum by (le) (\n irate(http_server_duration_bucket{job=~\"$job\", instance=~\"$instance\", http_route!=\"/oauth2/introspect\"}[$__rate_interval])\n))\n", + "legendFormat": "P99", + "refId": "P99" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "histogram_quantile(0.95, sum by (le) (\n 
irate(http_server_duration_bucket{job=~\"$job\", instance=~\"$instance\", http_route!=\"/oauth2/introspect\"}[$__rate_interval])\n))\n", + "legendFormat": "P95", + "refId": "P95" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "histogram_quantile(0.90, sum by (le) (\n irate(http_server_duration_bucket{job=~\"$job\", instance=~\"$instance\", http_route!=\"/oauth2/introspect\"}[$__rate_interval])\n))\n", + "legendFormat": "P90", + "refId": "P90" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "histogram_quantile(0.50, sum by (le) (\n irate(http_server_duration_bucket{job=~\"$job\", instance=~\"$instance\", http_route!=\"/oauth2/introspect\"}[$__rate_interval])\n))\n", + "legendFormat": "P50", + "refId": "P50" + } + ], + "title": "Request latency percentiles (excluding introspection)", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 217 + }, + "id": 15, + "options": { + "calculate": false, + "cellGap": 0, + "yAxis": { + "unit": "ms" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum by (le) (\n irate(http_server_duration_bucket{job=~\"$job\", instance=~\"$instance\", http_route=\"/oauth2/introspect\"}[$__rate_interval])\n)\n", + "format": "heatmap" + } + ], + "title": "Request latency (introspection only)", + "type": "heatmap" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10 + }, + "min": 0, + "unit": "ms" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 217 + }, + "id": 16, + "options": { + "legend": { + "calcs": [ + "mean" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": 
"prometheus", + "uid": "${datasource}" + }, + "expr": "histogram_quantile(0.99, sum by (le) (\n irate(http_server_duration_bucket{job=~\"$job\", instance=~\"$instance\", http_route=\"/oauth2/introspect\"}[$__rate_interval])\n))\n", + "legendFormat": "P99", + "refId": "P99" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "histogram_quantile(0.95, sum by (le) (\n irate(http_server_duration_bucket{job=~\"$job\", instance=~\"$instance\", http_route=\"/oauth2/introspect\"}[$__rate_interval])\n))\n", + "legendFormat": "P95", + "refId": "P95" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "histogram_quantile(0.90, sum by (le) (\n irate(http_server_duration_bucket{job=~\"$job\", instance=~\"$instance\", http_route=\"/oauth2/introspect\"}[$__rate_interval])\n))\n", + "legendFormat": "P90", + "refId": "P90" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "histogram_quantile(0.50, sum by (le) (\n irate(http_server_duration_bucket{job=~\"$job\", instance=~\"$instance\", http_route=\"/oauth2/introspect\"}[$__rate_interval])\n))\n", + "legendFormat": "P50", + "refId": "P50" + } + ], + "title": "Request latency percentiles (introspection only)", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 300 + }, + "id": 17, + "panels": [ ], + "title": "Database", + "type": "row" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 301 + }, + "id": 18, + "options": { + "calculate": false, + "cellGap": 0, + "yAxis": { + "unit": "ms" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum by (le) (\n irate(db_client_connections_create_time_milliseconds_bucket{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n)\n", + "format": 
"heatmap" + } + ], + "title": "Database connection acquisition latency", + "type": "heatmap" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10 + }, + "min": 0, + "unit": "ms" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 301 + }, + "id": 19, + "options": { + "legend": { + "calcs": [ + "mean" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "histogram_quantile(0.99, sum by (le) (\n irate(db_client_connections_create_time_milliseconds_bucket{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n))\n", + "legendFormat": "P99" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "histogram_quantile(0.95, sum by (le) (\n irate(db_client_connections_create_time_milliseconds_bucket{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n))\n", + "legendFormat": "P95" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "histogram_quantile(0.90, sum by (le) (\n irate(db_client_connections_create_time_milliseconds_bucket{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n))\n", + "legendFormat": "P90" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "histogram_quantile(0.50, sum by (le) (\n irate(db_client_connections_create_time_milliseconds_bucket{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n))\n", + "legendFormat": "P50" + } + ], + "title": "Database connection acquisition latency percentiles", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 100, + "lineWidth": 0, + "stacking": { + "mode": "normal" + } + }, + "min": 0 + } 
+ }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 301 + }, + "id": 20, + "options": { + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum by (state) (db_connections_usage{job=~\"$job\", instance=~\"$instance\"})\n", + "legendFormat": "Connection {{state}}" + } + ], + "title": "Database connection pool usage", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 400 + }, + "id": 21, + "panels": [ ], + "title": "Jobs", + "type": "row" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 401 + }, + "id": 22, + "options": { + "calculate": false, + "cellGap": 0, + "yAxis": { + "unit": "ms" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum by (le) (\n irate(job_process_duration_milliseconds_bucket{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n)\n", + "format": "heatmap" + } + ], + "title": "Job run duration", + "type": "heatmap" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "min": 0, + "unit": "ms" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 401 + }, + "id": 23, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "histogram_quantile(0.95, sum by (le, job_queue_name) (\n irate(job_process_duration_milliseconds_bucket{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n))\n", + "legendFormat": "{{job_queue_name}}", + "refId": "P95" + } + ], + "title": "Job run P95 per type", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + 
"min": 0, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 401 + }, + "id": 24, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum by(job_queue_name, job_result)\n(irate(job_process_duration_milliseconds_count{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]))\n", + "legendFormat": "{{job_queue_name}} {{job_result}}" + } + ], + "title": "Job runs", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 409 + }, + "id": 25, + "options": { + "calculate": false, + "cellGap": 0, + "yAxis": { + "unit": "ms" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum by (le) (\n irate(job_worker_tick_duration_bucket{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n)\n", + "format": "heatmap" + } + ], + "title": "Worker tick duration", + "type": "heatmap" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10 + }, + "min": 0, + "unit": "ms" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 409 + }, + "id": 26, + "options": { + "legend": { + "calcs": [ + "mean" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "histogram_quantile(0.99, sum by (le) (\n irate(job_worker_tick_duration_bucket{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n))\n", + "legendFormat": "P99", + "refId": "P99" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "histogram_quantile(0.95, sum by (le) (\n irate(job_worker_tick_duration_bucket{job=~\"$job\", 
instance=~\"$instance\"}[$__rate_interval])\n))\n", + "legendFormat": "P95", + "refId": "P95" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "histogram_quantile(0.90, sum by (le) (\n irate(job_worker_tick_duration_bucket{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n))\n", + "legendFormat": "P90", + "refId": "P90" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "histogram_quantile(0.50, sum by (le) (\n irate(job_worker_tick_duration_bucket{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n))\n", + "legendFormat": "P50", + "refId": "P50" + } + ], + "title": "Worker tick duration percentiles", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 500 + }, + "id": 27, + "panels": [ ], + "title": "Activity tracker", + "type": "row" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "min": 0, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 501 + }, + "id": 28, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum by(session_kind)\n(irate(mas_activity_tracker_messages_total{job=~\"$job\", instance=~\"$instance\", type=\"record\"}[$__rate_interval]))\n", + "legendFormat": "{{session_kind}}" + } + ], + "title": "Record rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 501 + }, + "id": 29, + "options": { + "calculate": false, + "cellGap": 0, + "yAxis": { + "unit": "ms" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum by (le) (\n irate(mas_activity_tracker_flush_time_milliseconds_bucket{job=~\"$job\", 
instance=~\"$instance\"}[$__rate_interval])\n)\n", + "format": "heatmap" + } + ], + "title": "Flush time", + "type": "heatmap" + }, + { + "datasource": { + "type": "datasource", + "uid": "-- Mixed --" + }, + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10 + }, + "min": 0, + "unit": "ms" + } + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 501 + }, + "id": 30, + "options": { + "legend": { + "calcs": [ + "mean" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "histogram_quantile(0.99, sum by (le) (\n irate(mas_activity_tracker_flush_time_milliseconds_bucket{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n))\n", + "legendFormat": "P99", + "refId": "P99" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "histogram_quantile(0.95, sum by (le) (\n irate(mas_activity_tracker_flush_time_milliseconds_bucket{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n))\n", + "legendFormat": "P95", + "refId": "P95" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "histogram_quantile(0.90, sum by (le) (\n irate(mas_activity_tracker_flush_time_milliseconds_bucket{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n))\n", + "legendFormat": "P90", + "refId": "P90" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "histogram_quantile(0.50, sum by (le) (\n irate(mas_activity_tracker_flush_time_milliseconds_bucket{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n))\n", + "legendFormat": "P50", + "refId": "P50" + } + ], + "title": "Flush time percentiles", + "type": "timeseries" + } + ], + "schemaVersion": 39, + "templating": { + "list": [ + { + "includeAll": false, + "multi": false, + "name": "datasource", + "query": "prometheus", + "refresh": 1, + 
"type": "datasource" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": true, + "multi": false, + "name": "job", + "query": "label_values(target_info, job)", + "refresh": 1, + "type": "query" + }, + { + "allValue": ".*", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": true, + "multi": false, + "name": "instance", + "query": "label_values(target_info{job=~\"$job\"}, instance)", + "refresh": 2, + "type": "query" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timezone": "utc", + "title": "matrix-authentication-service", + "uid": "matrix-authentication-service" +} diff --git a/misc/grafana/dashboard.libsonnet b/misc/grafana/dashboard.libsonnet new file mode 100644 index 000000000..60bb39fb2 --- /dev/null +++ b/misc/grafana/dashboard.libsonnet @@ -0,0 +1,240 @@ +local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; + +local queries = import './queries.libsonnet'; +local variables = import './variables.libsonnet'; + +g.dashboard.new('matrix-authentication-service') ++ g.dashboard.withDescription('A dashboard for monitoring matrix-authentication-service') ++ g.dashboard.withUid('matrix-authentication-service') ++ g.dashboard.withVariables(variables.variables) ++ g.dashboard.graphTooltip.withSharedCrosshair() ++ g.dashboard.withPanels( + g.util.grid.makeGrid([ + g.panel.row.new('Deployment') + + g.panel.row.withPanels([ + g.panel.timeSeries.new('Service Version') + + g.panel.timeSeries.queryOptions.withTargets(queries.serviceVersion) + + g.panel.timeSeries.options.tooltip.withMode('multi') + + g.panel.timeSeries.options.tooltip.withSort('desc') + + g.panel.timeSeries.fieldConfig.defaults.custom.withDrawStyle('line') + + g.panel.timeSeries.fieldConfig.defaults.custom.withFillOpacity(100) + + g.panel.timeSeries.fieldConfig.defaults.custom.withLineWidth(0) + + g.panel.timeSeries.fieldConfig.defaults.custom.stacking.withMode('percent'), + ]), 
+ ], 24, 6) /* positional makeGrid arguments here are panelWidth=24, panelHeight=6 (the generated "Service Version" panel has w=24, h=6) */ + + + g.util.grid.makeGrid([ /* Tokio runtime section: the row is created collapsed */ + g.panel.row.new('Tokio') + + g.panel.row.withCollapsed(true) + + g.panel.row.withPanels([ + g.panel.stat.new('Workers') + + g.panel.stat.queryOptions.withTargets(queries.tokio.workers), + + g.panel.heatmap.new('Tick Time') + + g.panel.heatmap.gridPos.withStatic() /* NOTE(review): the generated dashboard.json has no "static" key in this panel's gridPos, so this call looks like a no-op once makeGrid assigns positions - confirm and consider dropping it */ + + g.panel.heatmap.queryOptions.withTargets(queries.tokio.tickTimeHistogram) + + g.panel.heatmap.options.withCalculate(false) /* the generated target uses "format": "heatmap"; presumably the buckets come from the query, so Grafana is told not to calculate them - TODO confirm */ + + g.panel.heatmap.options.withCellGap(0) + + g.panel.heatmap.options.yAxis.withUnit('ns'), + + g.panel.timeSeries.new('Tick Time Percentile') + + g.panel.timeSeries.queryOptions.withTargets([ + queries.tokio.tickTimePercentile(0.99), + queries.tokio.tickTimePercentile(0.95), + queries.tokio.tickTimePercentile(0.90), + queries.tokio.tickTimePercentile(0.50), + ]) + + g.panel.timeSeries.options.tooltip.withMode('multi') + + g.panel.timeSeries.options.tooltip.withSort('desc') + + g.panel.timeSeries.options.legend.withCalcs(['mean']) + + g.panel.timeSeries.fieldConfig.defaults.custom.withFillOpacity(10) + + g.panel.timeSeries.standardOptions.withUnit('ns') + + g.panel.timeSeries.standardOptions.withMin(0), + + g.panel.timeSeries.new('Poll Rate') + + g.panel.timeSeries.queryOptions.withTargets(queries.tokio.pollRate) + + g.panel.timeSeries.standardOptions.withUnit('ops') + + g.panel.timeSeries.standardOptions.withMin(0), + + g.panel.timeSeries.new('Active Tasks') + + g.panel.timeSeries.queryOptions.withTargets(queries.tokio.activeTasks) + + g.panel.timeSeries.standardOptions.withUnit('tasks') + + g.panel.timeSeries.standardOptions.withMin(0), + + g.panel.timeSeries.new('Busy Time') + + g.panel.timeSeries.queryOptions.withTargets(queries.tokio.busyTime) + + g.panel.timeSeries.standardOptions.withUnit('ms') + + g.panel.timeSeries.standardOptions.withMin(0), + ]), + ], panelHeight=6, startY=100) /* each section gets its own startY (100, 200, 300, ...); presumably spaced out so the per-section grids never overlap - confirm */ + + + g.util.grid.makeGrid([ /* HTTP server section: request rate, error rate, then latency panels */ + g.panel.row.new('HTTP server') + + g.panel.row.withPanels([ + g.panel.timeSeries.new('Requests') + +
g.panel.timeSeries.queryOptions.withTargets(queries.http.requests) + + g.panel.timeSeries.standardOptions.withUnit('reqps') + + g.panel.timeSeries.standardOptions.withMin(0) + + g.panel.timeSeries.options.legend.withPlacement('right') + + g.panel.timeSeries.options.legend.withDisplayMode('table') + + g.panel.timeSeries.options.legend.withCalcs(['mean']) + , + + g.panel.timeSeries.new('Errors') + + g.panel.timeSeries.queryOptions.withTargets(queries.http.requestErrors) + + g.panel.timeSeries.standardOptions.withUnit('reqps') + + g.panel.timeSeries.standardOptions.withMin(0) + + g.panel.timeSeries.options.legend.withPlacement('right') + + g.panel.timeSeries.options.legend.withDisplayMode('table') + + g.panel.timeSeries.options.legend.withCalcs(['mean']), + + g.panel.heatmap.new('Request latency (excluding introspection)') /* latency is shown twice: once with the /oauth2/introspect route filtered out, once for that route alone; the string argument is the label matcher handed to the query helper */ + + g.panel.heatmap.queryOptions.withTargets(queries.http.requestLatencyHeatmap('http_route!="/oauth2/introspect"')) + + g.panel.heatmap.options.withCalculate(false) + + g.panel.heatmap.options.withCellGap(0) + + g.panel.heatmap.options.yAxis.withUnit('ms'), + + g.panel.timeSeries.new('Request latency percentiles (excluding introspection)') + + g.panel.timeSeries.queryOptions.withTargets([ + queries.http.requestLatencyPercentile(0.99, 'http_route!="/oauth2/introspect"'), + queries.http.requestLatencyPercentile(0.95, 'http_route!="/oauth2/introspect"'), + queries.http.requestLatencyPercentile(0.90, 'http_route!="/oauth2/introspect"'), + queries.http.requestLatencyPercentile(0.50, 'http_route!="/oauth2/introspect"'), + ]) + + g.panel.timeSeries.options.tooltip.withMode('multi') + + g.panel.timeSeries.options.tooltip.withSort('desc') + + g.panel.timeSeries.options.legend.withCalcs(['mean']) + + g.panel.timeSeries.fieldConfig.defaults.custom.withFillOpacity(10) + + g.panel.timeSeries.standardOptions.withUnit('ms') + + g.panel.timeSeries.standardOptions.withMin(0), + + g.panel.heatmap.new('Request latency (introspection only)') + +
g.panel.heatmap.queryOptions.withTargets(queries.http.requestLatencyHeatmap('http_route="/oauth2/introspect"')) + + g.panel.heatmap.options.withCalculate(false) + + g.panel.heatmap.options.withCellGap(0) + + g.panel.heatmap.options.yAxis.withUnit('ms'), + + g.panel.timeSeries.new('Request latency percentiles (introspection only)') + + g.panel.timeSeries.queryOptions.withTargets([ + queries.http.requestLatencyPercentile(0.99, 'http_route="/oauth2/introspect"'), + queries.http.requestLatencyPercentile(0.95, 'http_route="/oauth2/introspect"'), + queries.http.requestLatencyPercentile(0.90, 'http_route="/oauth2/introspect"'), + queries.http.requestLatencyPercentile(0.50, 'http_route="/oauth2/introspect"'), + ]) + + g.panel.timeSeries.options.tooltip.withMode('multi') + + g.panel.timeSeries.options.tooltip.withSort('desc') + + g.panel.timeSeries.options.legend.withCalcs(['mean']) + + g.panel.timeSeries.fieldConfig.defaults.custom.withFillOpacity(10) + + g.panel.timeSeries.standardOptions.withUnit('ms') + + g.panel.timeSeries.standardOptions.withMin(0), + ]), + ], panelWidth=12, startY=200) /* HTTP panels are 12 units wide instead of the makeGrid default */ + + + g.util.grid.makeGrid([ /* Database section */ + g.panel.row.new('Database') + + g.panel.row.withPanels([ + g.panel.heatmap.new('Database connection acquisition latency') /* NOTE(review): the generated query reads db_client_connections_create_time_milliseconds_bucket (see dashboard.json); confirm that metric measures pool acquisition and not connection creation, or adjust the title */ + + g.panel.heatmap.queryOptions.withTargets(queries.database.acquisitionLatencyHeatmap) + + g.panel.heatmap.options.withCalculate(false) + + g.panel.heatmap.options.withCellGap(0) + + g.panel.heatmap.options.yAxis.withUnit('ms'), + + g.panel.timeSeries.new('Database connection acquisition latency percentiles') + + g.panel.timeSeries.queryOptions.withTargets([ + queries.database.acquisitionLatencyPercentile(0.99), + queries.database.acquisitionLatencyPercentile(0.95), + queries.database.acquisitionLatencyPercentile(0.90), + queries.database.acquisitionLatencyPercentile(0.50), + ]) + + g.panel.timeSeries.options.tooltip.withMode('multi') + + g.panel.timeSeries.options.tooltip.withSort('desc') + +
g.panel.timeSeries.options.legend.withCalcs(['mean']) + + g.panel.timeSeries.fieldConfig.defaults.custom.withFillOpacity(10) + + g.panel.timeSeries.standardOptions.withUnit('ms') + + g.panel.timeSeries.standardOptions.withMin(0), + + g.panel.timeSeries.new('Database connection pool usage') /* drawn as fully filled, zero-width-line series stacked in 'normal' mode; the generated query sums by the state label */ + + g.panel.timeSeries.queryOptions.withTargets(queries.database.poolUsage) + + g.panel.timeSeries.fieldConfig.defaults.custom.withDrawStyle('line') + + g.panel.timeSeries.fieldConfig.defaults.custom.withFillOpacity(100) + + g.panel.timeSeries.fieldConfig.defaults.custom.withLineWidth(0) + + g.panel.timeSeries.fieldConfig.defaults.custom.stacking.withMode('normal') + + g.panel.timeSeries.options.tooltip.withMode('multi') + + g.panel.timeSeries.options.tooltip.withSort('desc') + + g.panel.timeSeries.standardOptions.withMin(0), + ]), + ], startY=300) + + + g.util.grid.makeGrid([ /* Jobs section: job run duration and rate, then worker tick duration */ + g.panel.row.new('Jobs') + + g.panel.row.withPanels([ + g.panel.heatmap.new('Job run duration') + + g.panel.heatmap.queryOptions.withTargets(queries.jobs.durationHistogram) + + g.panel.heatmap.options.withCalculate(false) + + g.panel.heatmap.options.withCellGap(0) + + g.panel.heatmap.options.yAxis.withUnit('ms'), + + g.panel.timeSeries.new('Job run P95 per type') /* only the 0.95 quantile is plotted; the generated legend is {{job_queue_name}}, i.e. one series per queue name */ + + g.panel.timeSeries.queryOptions.withTargets( + queries.jobs.durationPercentilePerType(0.95), + ) + + g.panel.timeSeries.standardOptions.withUnit('ms') + + g.panel.timeSeries.standardOptions.withMin(0), + + g.panel.timeSeries.new('Job runs') + + g.panel.timeSeries.queryOptions.withTargets(queries.jobs.rate) + + g.panel.timeSeries.standardOptions.withUnit('ops') + + g.panel.timeSeries.standardOptions.withMin(0), + + g.panel.heatmap.new('Worker tick duration') + + g.panel.heatmap.queryOptions.withTargets(queries.jobs.tickHistogram) + + g.panel.heatmap.options.withCalculate(false) + + g.panel.heatmap.options.withCellGap(0) + + g.panel.heatmap.options.yAxis.withUnit('ms'), + + g.panel.timeSeries.new('Worker tick duration percentiles') + +
g.panel.timeSeries.queryOptions.withTargets([ + queries.jobs.tickPercentile(0.99), + queries.jobs.tickPercentile(0.95), + queries.jobs.tickPercentile(0.90), + queries.jobs.tickPercentile(0.50), + ]) + + g.panel.timeSeries.options.tooltip.withMode('multi') + + g.panel.timeSeries.options.tooltip.withSort('desc') + + g.panel.timeSeries.options.legend.withCalcs(['mean']) + + g.panel.timeSeries.fieldConfig.defaults.custom.withFillOpacity(10) + + g.panel.timeSeries.standardOptions.withUnit('ms') + + g.panel.timeSeries.standardOptions.withMin(0), + ]), + ], startY=400) + + + g.util.grid.makeGrid([ + g.panel.row.new('Activity tracker') + + g.panel.row.withPanels([ + g.panel.timeSeries.new('Record rate') + + g.panel.timeSeries.queryOptions.withTargets(queries.activityTracker.recordRate) + + g.panel.timeSeries.standardOptions.withUnit('ops') + + g.panel.timeSeries.standardOptions.withMin(0), + + g.panel.heatmap.new('Flush time') + + g.panel.heatmap.queryOptions.withTargets(queries.activityTracker.flushTimeHistogram) + + g.panel.heatmap.options.withCalculate(false) + + g.panel.heatmap.options.withCellGap(0) + + g.panel.heatmap.options.yAxis.withUnit('ms'), + + g.panel.timeSeries.new('Flush time percentiles') + + g.panel.timeSeries.queryOptions.withTargets([ + queries.activityTracker.flushTimePercentile(0.99), + queries.activityTracker.flushTimePercentile(0.95), + queries.activityTracker.flushTimePercentile(0.90), + queries.activityTracker.flushTimePercentile(0.50), + ]) + + g.panel.timeSeries.options.tooltip.withMode('multi') + + g.panel.timeSeries.options.tooltip.withSort('desc') + + g.panel.timeSeries.options.legend.withCalcs(['mean']) + + g.panel.timeSeries.fieldConfig.defaults.custom.withFillOpacity(10) + + g.panel.timeSeries.standardOptions.withUnit('ms') + + g.panel.timeSeries.standardOptions.withMin(0), + ]), + ], startY=500) +) diff --git a/misc/grafana/jsonnetfile.json b/misc/grafana/jsonnetfile.json new file mode 100644 index 000000000..2d56d9124 --- /dev/null +++ 
b/misc/grafana/jsonnetfile.json
@@ -0,0 +1,15 @@
+{
+  "version": 1,
+  "dependencies": [
+    {
+      "source": {
+        "git": {
+          "remote": "https://github.com/grafana/grafonnet.git",
+          "subdir": "gen/grafonnet-latest"
+        }
+      },
+      "version": "main"
+    }
+  ],
+  "legacyImports": false
+}
diff --git a/misc/grafana/jsonnetfile.lock.json b/misc/grafana/jsonnetfile.lock.json
new file mode 100644
index 000000000..ce3ca834d
--- /dev/null
+++ b/misc/grafana/jsonnetfile.lock.json
@@ -0,0 +1,46 @@
+{
+  "version": 1,
+  "dependencies": [
+    {
+      "source": {
+        "git": {
+          "remote": "https://github.com/grafana/grafonnet.git",
+          "subdir": "gen/grafonnet-latest"
+        }
+      },
+      "version": "d20e609202733790caf5b554c9945d049f243ae3",
+      "sum": "V9vAj21qJOc2DlMPDgB1eEjSQU4A+sAA4AXuJ6bd4xc="
+    },
+    {
+      "source": {
+        "git": {
+          "remote": "https://github.com/grafana/grafonnet.git",
+          "subdir": "gen/grafonnet-v11.4.0"
+        }
+      },
+      "version": "d20e609202733790caf5b554c9945d049f243ae3",
+      "sum": "aVAX09paQYNOoCSKVpuk1exVIyBoMt/C50QJI+Q/3nA="
+    },
+    {
+      "source": {
+        "git": {
+          "remote": "https://github.com/jsonnet-libs/docsonnet.git",
+          "subdir": "doc-util"
+        }
+      },
+      "version": "6ac6c69685b8c29c54515448eaca583da2d88150",
+      "sum": "BrAL/k23jq+xy9oA7TWIhUx07dsA/QLm3g7ktCwe//U="
+    },
+    {
+      "source": {
+        "git": {
+          "remote": "https://github.com/jsonnet-libs/xtd.git",
+          "subdir": ""
+        }
+      },
+      "version": "4eee017d21cb63a303925d1dcd9fc5c496809b46",
+      "sum": "Kh0GbIycNmJPzk6IOMXn1BbtLNyaiiimclYk7+mvsns="
+    }
+  ],
+  "legacyImports": false
+}
diff --git a/misc/grafana/queries.libsonnet b/misc/grafana/queries.libsonnet
new file mode 100644
index 000000000..67d773738
--- /dev/null
+++ b/misc/grafana/queries.libsonnet
@@ -0,0 +1,282 @@
+local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet';
+local prometheusQuery = g.query.prometheus;
+local variables = import './variables.libsonnet';
+
+// Prometheus queries backing the dashboard panels. Each query embeds
+// `variables.selectors` in its label matchers.
+
+// Reference to the datasource dashboard variable, in `${name}` form.
+local datasource = '${%s}' % variables.datasource.name;
+
+// Label for a percentile given as a fraction (0.99 gives 'P99'); shared by the
+// legend and refId of the percentile queries below.
+local percentileLabel(percentile) = 'P%02d' % (percentile * 100);
+
+{
+  // Sum of `target_info`, grouped by `service_version`.
+  serviceVersion:
+    prometheusQuery.new(
+      datasource,
+      'sum by (service_version) (target_info{%s})'
+      % [variables.selectors],
+    )
+    + prometheusQuery.withLegendFormat('{{service_version}}'),
+
+  // Tokio runtime metrics.
+  tokio: {
+    // Worker poll time buckets, formatted for a heatmap panel.
+    tickTimeHistogram:
+      prometheusQuery.new(
+        datasource,
+        'sum by (le) (irate(tokio_runtime_worker_poll_time_bucket{%s}[$__rate_interval]))'
+        % [variables.selectors],
+      )
+      + prometheusQuery.withFormat('heatmap'),
+
+    // Worker poll time at the given percentile (a fraction such as 0.99).
+    tickTimePercentile(percentile):
+      prometheusQuery.new(
+        datasource,
+        'histogram_quantile(%0.2f, sum by (le) (irate(tokio_runtime_worker_poll_time_bucket{%s}[$__rate_interval])))'
+        % [percentile, variables.selectors],
+      )
+      + prometheusQuery.withLegendFormat(percentileLabel(percentile))
+      + prometheusQuery.withRefId(percentileLabel(percentile)),
+
+    pollRate:
+      prometheusQuery.new(
+        datasource,
+        'sum(irate(tokio_runtime_worker_polls_total{%s}[$__rate_interval]))'
+        % [variables.selectors],
+      )
+      + prometheusQuery.withLegendFormat('Polls'),
+
+    activeTasks:
+      prometheusQuery.new(
+        datasource,
+        'sum(tokio_runtime_alive_tasks{%s})'
+        % [variables.selectors],
+      )
+      + prometheusQuery.withLegendFormat('Active Tasks'),
+
+    busyTime:
+      prometheusQuery.new(
+        datasource,
+        'sum(irate(tokio_runtime_worker_busy_duration_milliseconds_total{%s}[$__rate_interval]))'
+        % [variables.selectors],
+      )
+      + prometheusQuery.withLegendFormat('Worker busy Time'),
+
+    workers:
+      prometheusQuery.new(
+        datasource,
+        'sum(tokio_runtime_workers{%s})'
+        % [variables.selectors],
+      ),
+  },
+
+  // HTTP server metrics.
+  http: {
+    // Request rate per method, route and response status code.
+    requests:
+      prometheusQuery.new(
+        datasource,
+        |||
+          sum by (http_request_method, http_route, http_response_status_code )
+          (irate(http_server_duration_count{%s}[$__rate_interval]))
+        |||
+        % [variables.selectors],
+      )
+      + prometheusQuery.withLegendFormat('{{http_request_method}} {{http_route}} {{http_response_status_code}}'),
+
+    // Same as `requests`, restricted to 4xx and 5xx response status codes.
+    requestErrors:
+      prometheusQuery.new(
+        datasource,
+        |||
+          sum by (http_request_method, http_route, http_response_status_code)
+          (irate(http_server_duration_count{%s, http_response_status_code=~"4..|5.."}[$__rate_interval]))
+        |||
+        % [variables.selectors],
+      )
+      + prometheusQuery.withLegendFormat('{{http_request_method}} {{http_route}} {{http_response_status_code}}'),
+
+    // Request duration buckets, formatted for a heatmap panel.
+    // `selector` holds extra label matchers appended after the shared selectors.
+    // NOTE(review): with the default '' the matcher list ends in a dangling
+    // ', ' -- presumably tolerated by the query backend; confirm.
+    requestLatencyHeatmap(selector=''):
+      prometheusQuery.new(
+        datasource,
+        |||
+          sum by (le) (
+            irate(http_server_duration_bucket{%s, %s}[$__rate_interval])
+          )
+        |||
+        % [variables.selectors, selector],
+      )
+      + prometheusQuery.withFormat('heatmap'),
+
+    // Request duration at the given percentile; `selector` as above.
+    requestLatencyPercentile(percentile, selector=''):
+      prometheusQuery.new(
+        datasource,
+        |||
+          histogram_quantile(%0.2f, sum by (le) (
+            irate(http_server_duration_bucket{%s, %s}[$__rate_interval])
+          ))
+        |||
+        % [percentile, variables.selectors, selector],
+      )
+      + prometheusQuery.withLegendFormat(percentileLabel(percentile))
+      + prometheusQuery.withRefId(percentileLabel(percentile)),
+  },
+
+  // Database connection metrics.
+  database: {
+    // Buckets of the connection create-time histogram, for a heatmap panel.
+    acquisitionLatencyHeatmap:
+      prometheusQuery.new(
+        datasource,
+        |||
+          sum by (le) (
+            irate(db_client_connections_create_time_milliseconds_bucket{%s}[$__rate_interval])
+          )
+        |||
+        % [variables.selectors],
+      )
+      + prometheusQuery.withFormat('heatmap'),
+
+    // Same histogram at the given percentile. Sets a refId like the other
+    // percentile queries, so every target of a percentile panel is named alike.
+    acquisitionLatencyPercentile(percentile):
+      prometheusQuery.new(
+        datasource,
+        |||
+          histogram_quantile(%0.2f, sum by (le) (
+            irate(db_client_connections_create_time_milliseconds_bucket{%s}[$__rate_interval])
+          ))
+        |||
+        % [percentile, variables.selectors],
+      )
+      + prometheusQuery.withLegendFormat(percentileLabel(percentile))
+      + prometheusQuery.withRefId(percentileLabel(percentile)),
+
+    // Connection usage, grouped by `state`.
+    poolUsage:
+      prometheusQuery.new(
+        datasource,
+        |||
+          sum by (state) (db_connections_usage{%s})
+        |||
+        % [variables.selectors],
+      )
+      + prometheusQuery.withLegendFormat('Connection {{state}}'),
+  },
+
+  // Job queue metrics.
+  jobs: {
+    durationHistogram:
+      prometheusQuery.new(
+        datasource,
+        |||
+          sum by (le) (
+            irate(job_process_duration_milliseconds_bucket{%s}[$__rate_interval])
+          )
+        |||
+        % [variables.selectors],
+      )
+      + prometheusQuery.withFormat('heatmap'),
+
+    // Job duration at the given percentile, one series per `job_queue_name`.
+    durationPercentilePerType(percentile):
+      prometheusQuery.new(
+        datasource,
+        |||
+          histogram_quantile(%0.2f, sum by (le, job_queue_name) (
+            irate(job_process_duration_milliseconds_bucket{%s}[$__rate_interval])
+          ))
+        |||
+        % [percentile, variables.selectors],
+      )
+      + prometheusQuery.withLegendFormat('{{job_queue_name}}')
+      + prometheusQuery.withRefId(percentileLabel(percentile)),
+
+    // Job processing rate per queue name and result.
+    rate:
+      prometheusQuery.new(
+        datasource,
+        |||
+          sum by(job_queue_name, job_result)
+          (irate(job_process_duration_milliseconds_count{%s}[$__rate_interval]))
+        |||
+        % [variables.selectors],
+      )
+      + prometheusQuery.withLegendFormat('{{job_queue_name}} {{job_result}}'),
+
+    tickHistogram:
+      prometheusQuery.new(
+        datasource,
+        |||
+          sum by (le) (
+            irate(job_worker_tick_duration_bucket{%s}[$__rate_interval])
+          )
+        |||
+        % [variables.selectors],
+      )
+      + prometheusQuery.withFormat('heatmap'),
+
+    tickPercentile(percentile):
+      prometheusQuery.new(
+        datasource,
+        |||
+          histogram_quantile(%0.2f, sum by (le) (
+            irate(job_worker_tick_duration_bucket{%s}[$__rate_interval])
+          ))
+        |||
+        % [percentile, variables.selectors],
+      )
+      + prometheusQuery.withLegendFormat(percentileLabel(percentile))
+      + prometheusQuery.withRefId(percentileLabel(percentile)),
+  },
+
+  // Activity tracker metrics.
+  activityTracker: {
+    // Rate of messages with type="record", per `session_kind`.
+    recordRate:
+      prometheusQuery.new(
+        datasource,
+        |||
+          sum by(session_kind)
+          (irate(mas_activity_tracker_messages_total{%s, type="record"}[$__rate_interval]))
+        |||
+        % [variables.selectors],
+      )
+      + prometheusQuery.withLegendFormat('{{session_kind}}'),
+
+    flushTimeHistogram:
+      prometheusQuery.new(
+        datasource,
+        |||
+          sum by (le) (
+            irate(mas_activity_tracker_flush_time_milliseconds_bucket{%s}[$__rate_interval])
+          )
+        |||
+        % [variables.selectors],
+      )
+      + prometheusQuery.withFormat('heatmap'),
+
+    flushTimePercentile(percentile):
+      prometheusQuery.new(
+        datasource,
+        |||
+          histogram_quantile(%0.2f, sum by (le) (
+            irate(mas_activity_tracker_flush_time_milliseconds_bucket{%s}[$__rate_interval])
+          ))
+        |||
+        % [percentile, variables.selectors],
+      )
+      + prometheusQuery.withLegendFormat(percentileLabel(percentile))
+      + prometheusQuery.withRefId(percentileLabel(percentile)),
+  },
+}
diff --git a/misc/grafana/variables.libsonnet b/misc/grafana/variables.libsonnet
new file mode 100644
index 000000000..eca7cc94c
--- /dev/null
+++ b/misc/grafana/variables.libsonnet
@@ -0,0 +1,47 @@
+local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet';
+local var = g.dashboard.variable;
+
+// Dashboard template variables and the label selectors built from them.
+{
+  // Prometheus datasource variable; single-valued, without an "All" option.
+  datasource:
+    var.datasource.new('datasource', 'prometheus')
+    + var.query.selectionOptions.withIncludeAll(false)
+    + var.query.selectionOptions.withMulti(false)
+    + var.query.refresh.onLoad(),
+
+  // Values of the `job` label on `target_info`, with an "All" option.
+  job:
+    var.query.new('job')
+    + var.query.withDatasourceFromVariable(self.datasource)
+    + var.query.queryTypes.withLabelValues('job', 'target_info')
+    + var.query.selectionOptions.withIncludeAll(true)
+    + var.query.selectionOptions.withMulti(false)
+    + var.query.refresh.onLoad(),
+
+  // Values of the `instance` label on `target_info`, filtered by the job
+  // variable. The '.*' argument is presumably the custom "All" value; confirm.
+  instance:
+    var.query.new('instance')
+    + var.query.withDatasourceFromVariable(self.datasource)
+    + var.query.queryTypes.withLabelValues(
+      'instance',
+      'target_info{job=~"$%s"}' % [self.job.name],
+    )
+    + var.query.selectionOptions.withIncludeAll(true, '.*')
+    + var.query.selectionOptions.withMulti(false)
+    + var.query.refresh.onTime(),
+
+  // Every variable defined above, as a list.
+  variables: [
+    self.datasource,
+    self.job,
+    self.instance,
+  ],
+
+  // Label matchers on the job and instance variables, for use inside `{...}`.
+  selectors: 'job=~"$%s", instance=~"$%s"' % [
+    self.job.name,
+    self.instance.name,
+  ],
+}