diff --git a/README.md b/README.md index 0b13392..d37b4bf 100644 --- a/README.md +++ b/README.md @@ -30,6 +30,7 @@ Each resource has an associated module that will create Cloud Observability dash * __OpenTelemetry Collector__ (module: [`otel-collector-dashboard`](https://github.com/lightstep/terraform-opentelemetry-dashboards/tree/main/collector-dashboards/otel-collector-dashboard)) * __Dockerstats Metrics__ (module: [`otel-collector-dockerstats-dashboard`](https://github.com/lightstep/terraform-opentelemetry-dashboards/tree/main/collector-dashboards/otel-collector-dockerstats-dashboard)) * __OpenTelemetry Elasticsearchreceiver Receiver__ (module: [`otel-collector-elasticsearchreceiver-dashboard`](https://github.com/lightstep/terraform-opentelemetry-dashboards/tree/main/collector-dashboards/otel-collector-elasticsearchreceiver-dashboard)) +* __Envoy - Overview__ (module: [`otel-collector-envoy-dashboard`](https://github.com/lightstep/terraform-opentelemetry-dashboards/tree/main/collector-dashboards/otel-collector-envoy-dashboard)) * __ETCD v3 - Overview__ (module: [`otel-collector-etcd-dashboard`](https://github.com/lightstep/terraform-opentelemetry-dashboards/tree/main/collector-dashboards/otel-collector-etcd-dashboard)) * __Flink - Overview__ (module: [`otel-collector-flink-dashboard`](https://github.com/lightstep/terraform-opentelemetry-dashboards/tree/main/collector-dashboards/otel-collector-flink-dashboard)) * __Fluentd Records__ (module: [`otel-collector-fluentd-dashboard`](https://github.com/lightstep/terraform-opentelemetry-dashboards/tree/main/collector-dashboards/otel-collector-fluentd-dashboard)) @@ -45,6 +46,7 @@ Each resource has an associated module that will create Cloud Observability dash * __OpenTelemetry / Host Metrics / Paging__ (module: [`otel-collector-hostmetrics-paging-dashboard`](https://github.com/lightstep/terraform-opentelemetry-dashboards/tree/main/collector-dashboards/otel-collector-hostmetrics-paging-dashboard)) * __OpenTelemetry IBMMQ 
Integration__ (module: [`otel-collector-ibmmq-dashboard`](https://github.com/lightstep/terraform-opentelemetry-dashboards/tree/main/collector-dashboards/otel-collector-ibmmq-dashboard)) * __OpenTelemetry iisreceiver Integration__ (module: [`otel-collector-iisreceiver-dashboard`](https://github.com/lightstep/terraform-opentelemetry-dashboards/tree/main/collector-dashboards/otel-collector-iisreceiver-dashboard)) +* __OpenTelemetry JBoss Wildfly Dashboard__ (module: [`otel-collector-jbosswildfly-prom-receiver-dashboard`](https://github.com/lightstep/terraform-opentelemetry-dashboards/tree/main/collector-dashboards/otel-collector-jbosswildfly-prom-receiver-dashboard)) * __K8S Kubelet__ (module: [`otel-collector-k8s-kubelet-prom-dashboard`](https://github.com/lightstep/terraform-opentelemetry-dashboards/tree/main/collector-dashboards/otel-collector-k8s-kubelet-prom-dashboard)) * __Node Exporter__ (module: [`otel-collector-k8s-node-exporter-prom-dashboard`](https://github.com/lightstep/terraform-opentelemetry-dashboards/tree/main/collector-dashboards/otel-collector-k8s-node-exporter-prom-dashboard)) * __Kubernetes Resources - Pod__ (module: [`otel-collector-k8s-pod-resources-prom-dashboard`](https://github.com/lightstep/terraform-opentelemetry-dashboards/tree/main/collector-dashboards/otel-collector-k8s-pod-resources-prom-dashboard)) diff --git a/collector-dashboards/otel-collector-envoy-dashboard/main.tf b/collector-dashboards/otel-collector-envoy-dashboard/main.tf new file mode 100644 index 0000000..5d429db --- /dev/null +++ b/collector-dashboards/otel-collector-envoy-dashboard/main.tf @@ -0,0 +1,373 @@ +terraform { + required_providers { + lightstep = { + source = "lightstep/lightstep" + version = "~> 1.79.0" + } + } + required_version = ">= v1.0.11" +} + +resource "lightstep_dashboard" "otel_collector_envoy_dashboard" { + project_name = var.cloud_observability_project + dashboard_name = "Envoy - Overview" + dashboard_description = "This dashboard provides a 
high-level overview of your Envoy cluster so you can monitor its performance and resource usage." + + group { + rank = 3 + title = "Listeners" + visibility_type = "explicit" + + chart { + name = "Listeners Success Rate (Excluding Admin Interface)" + type = "timeseries" + rank = 0 + x_pos = 0 + y_pos = 0 + width = 16 + height = 8 + + query { + query_name = "((a/b)*100)" + display = "line" + hidden = false + query_string = "with\n a = metric envoy_http_downstream_rq_xx | filter ((((envoy_response_code_class != \"4\") && (envoy_response_code_class != 4)) && (envoy_response_code_class != 4.0)) && (((envoy_response_code_class != \"5\") && (envoy_response_code_class != 5)) && (envoy_response_code_class != 5.0))) | rate | group_by [], sum;\n b = metric envoy_http_downstream_rq_completed | rate | group_by [], sum;\njoin (((a / b)*100)), a=0, b=0" + } + } + chart { + name = "Listeners Response Time Percentiles" + type = "timeseries" + rank = 1 + x_pos = 16 + y_pos = 0 + width = 16 + height = 8 + + query { + query_name = "a" + display = "line" + hidden = false + query_string = "metric envoy_http_downstream_rq_time | delta | group_by [], sum | point percentile(value, 50.0), percentile(value, 95.0), percentile(value, 99.0), percentile(value, 99.9)" + } + } + chart { + name = "Listener Traffic" + type = "timeseries" + rank = 2 + x_pos = 32 + y_pos = 0 + width = 16 + height = 8 + + query { + query_name = "a" + display = "line" + hidden = false + query_string = "metric envoy_http_downstream_rq_completed | rate | group_by [], sum" + } + query { + query_name = "b" + display = "line" + hidden = false + query_string = "metric envoy_http_downstream_cx_tx_bytes_total | rate | group_by [], sum" + } + } + chart { + name = "Requests Rejected By Reason" + type = "timeseries" + rank = 3 + x_pos = 0 + y_pos = 8 + width = 16 + height = 8 + + query { + query_name = "a" + 
display = "line" + hidden = false + query_string = "metric envoy_http_no_route | rate | group_by [], sum" + } + query { + query_name = "b" + display = "line" + hidden = false + query_string = "metric envoy_http_no_cluster | rate | group_by [], sum" + } + query { + query_name = "c" + display = "line" + hidden = false + query_string = "metric envoy_http_downstream_cx_protocol_error | rate | group_by [], sum" + } + } + chart { + name = "Active Connections Per Type and Listener" + type = "timeseries" + rank = 4 + x_pos = 16 + y_pos = 8 + width = 16 + height = 8 + + query { + query_name = "a" + display = "line" + hidden = false + query_string = "metric envoy_http_downstream_cx_active | latest | group_by [], sum" + } + query { + query_name = "b" + display = "line" + hidden = false + query_string = "metric envoy_http_downstream_cx_http2_active | latest | group_by [], sum" + } + query { + query_name = "c" + display = "line" + hidden = false + query_string = "metric envoy_http_downstream_cx_http1_active | latest | group_by [], sum" + } + } + } + group { + rank = 0 + title = "" + visibility_type = "implicit" + } + group { + rank = 1 + title = "Overview" + visibility_type = "explicit" + + chart { + name = "Incoming Success Rate (Non-5xx Responses)" + type = "timeseries" + rank = 0 + x_pos = 0 + y_pos = 0 + width = 16 + height = 8 + + query { + query_name = "((1-(a/b))*100)" + display = "big_number" + hidden = false + query_string = "with\n a = metric envoy_cluster_upstream_rq_xx | filter (((envoy_response_code_class == \"5\") || (envoy_response_code_class == 5)) || (envoy_response_code_class == 5.0)) | rate 5m, 5m | group_by [], mean;\n b = metric envoy_cluster_upstream_rq_completed | rate 5m, 5m | group_by [], mean;\njoin (((1-(a / b))*100)), a=0, b=0 | reduce 5m, mean" + } + } + chart { + name = "Incoming Requests Volume" + type = "timeseries" + rank = 1 + x_pos = 16 + y_pos = 0 + width = 16 + height = 8 + + query { + query_name = "a" + display = "big_number" + hidden = 
false + query_string = "metric envoy_cluster_upstream_rq_xx | rate 5m | group_by [], mean" + } + } + chart { + name = "Incoming Requests by Release" + type = "timeseries" + rank = 2 + x_pos = 32 + y_pos = 0 + width = 16 + height = 8 + + query { + query_name = "a" + display = "big_number" + hidden = false + query_string = "metric envoy_cluster_upstream_rq_total | rate 5m | group_by [], mean" + } + } + } + group { + rank = 2 + title = "Upstream Clusters" + visibility_type = "explicit" + + chart { + name = "Upstream Response 2xx (% Breakdown)" + type = "timeseries" + rank = 0 + x_pos = 0 + y_pos = 0 + width = 16 + height = 8 + + query { + query_name = "((a/b)*100)" + display = "bar" + hidden = false + query_string = "with\n a = metric envoy_cluster_upstream_rq_xx | filter (((envoy_response_code_class == \"2\") || (envoy_response_code_class == 2)) || (envoy_response_code_class == 2.0)) | rate | group_by [], sum;\n b = metric envoy_cluster_upstream_rq_completed | rate | group_by [], sum;\njoin (((a / b)*100)), a=0, b=0" + } + } + chart { + name = "Upstream Response 3xx (% Breakdown)" + type = "timeseries" + rank = 1 + x_pos = 16 + y_pos = 0 + width = 16 + height = 8 + + query { + query_name = "((a/b)*100)" + display = "bar" + hidden = false + query_string = "with\n a = metric envoy_cluster_upstream_rq_xx | filter (((envoy_response_code_class == \"3\") || (envoy_response_code_class == 3)) || (envoy_response_code_class == 3.0)) | rate | group_by [], sum;\n b = metric envoy_cluster_upstream_rq_completed | rate | group_by [], sum;\njoin (((a / b)*100)), a=0, b=0" + } + } + chart { + name = "Upstream Response 4xx (% Breakdown)" + type = "timeseries" + rank = 2 + x_pos = 32 + y_pos = 0 + width = 16 + height = 8 + + query { + query_name = "((a/b)*100)" + display = "bar" + hidden = false + query_string = "with\n a = metric envoy_cluster_upstream_rq_xx | filter (((envoy_response_code_class == \"4\") || (envoy_response_code_class == 4)) || (envoy_response_code_class == 4.0)) | 
rate | group_by [], sum;\n b = metric envoy_cluster_upstream_rq_completed | rate | group_by [], sum;\njoin (((a / b)*100)), a=0, b=0" + } + } + chart { + name = "Upstream Response 5xx (% Breakdown)" + type = "timeseries" + rank = 3 + x_pos = 0 + y_pos = 8 + width = 16 + height = 8 + + query { + query_name = "((a/b)*100)" + display = "bar" + hidden = false + query_string = "with\n a = metric envoy_cluster_upstream_rq_xx | filter (((envoy_response_code_class == \"5\") || (envoy_response_code_class == 5)) || (envoy_response_code_class == 5.0)) | rate | group_by [], sum;\n b = metric envoy_cluster_upstream_rq_completed | rate | group_by [], sum;\njoin (((a / b)*100)), a=0, b=0" + } + } + chart { + name = "Upstream Response 2xx (Total Breakdown)" + type = "timeseries" + rank = 4 + x_pos = 16 + y_pos = 8 + width = 16 + height = 8 + + query { + query_name = "a" + display = "bar" + hidden = false + query_string = "metric envoy_cluster_upstream_rq_xx | filter (((envoy_response_code_class == \"2\") || (envoy_response_code_class == 2)) || (envoy_response_code_class == 2.0)) | rate | group_by [], sum" + } + } + chart { + name = "Upstream Response 3xx (Total Breakdown)" + type = "timeseries" + rank = 5 + x_pos = 32 + y_pos = 8 + width = 16 + height = 8 + + query { + query_name = "a" + display = "bar" + hidden = false + query_string = "metric envoy_cluster_upstream_rq_xx | filter (((envoy_response_code_class == \"3\") || (envoy_response_code_class == 3)) || (envoy_response_code_class == 3.0)) | rate | group_by [], sum" + } + } + chart { + name = "Upstream Response 4xx (Total Breakdown)" + type = "timeseries" + rank = 6 + x_pos = 0 + y_pos = 16 + width = 16 + height = 8 + + query { + query_name = "a" + display = "bar" + hidden = false + query_string = "metric envoy_cluster_upstream_rq_xx | filter (((envoy_response_code_class == \"4\") || (envoy_response_code_class == 4)) || (envoy_response_code_class == 4.0)) | rate | group_by [], sum" + } + } + chart { + name = "Upstream 
Response 5xx (Total Breakdown)" + type = "timeseries" + rank = 7 + x_pos = 16 + y_pos = 16 + width = 16 + height = 8 + + query { + query_name = "a" + display = "bar" + hidden = false + query_string = "metric envoy_cluster_upstream_rq_xx | filter (((envoy_response_code_class == \"5\") || (envoy_response_code_class == 5)) || (envoy_response_code_class == 5.0)) | rate | group_by [], sum" + } + } + chart { + name = "Upstream p99 Response Time" + type = "timeseries" + rank = 8 + x_pos = 32 + y_pos = 16 + width = 16 + height = 8 + + query { + query_name = "a" + display = "line" + hidden = false + query_string = "metric envoy_cluster_upstream_rq_time | delta | group_by [], sum | point percentile(value, 99.0)" + } + } + chart { + name = "Average Upstream Traffic Rate" + type = "timeseries" + rank = 9 + x_pos = 0 + y_pos = 24 + width = 16 + height = 8 + + query { + query_name = "a" + display = "line" + hidden = false + query_string = "metric envoy_cluster_upstream_rq_total | rate | group_by [], sum" + } + } + chart { + name = "Cluster Load Balancer Panics" + type = "timeseries" + rank = 10 + x_pos = 16 + y_pos = 24 + width = 16 + height = 8 + + query { + query_name = "a" + display = "line" + hidden = false + query_string = "metric envoy_cluster_lb_healthy_panic | rate | group_by [], sum" + } + } + } +} \ No newline at end of file diff --git a/collector-dashboards/otel-collector-envoy-dashboard/outputs.tf b/collector-dashboards/otel-collector-envoy-dashboard/outputs.tf new file mode 100644 index 0000000..e55bc66 --- /dev/null +++ b/collector-dashboards/otel-collector-envoy-dashboard/outputs.tf @@ -0,0 +1,4 @@ +output "dashboard_url" { + value = "https://app.lightstep.com/${var.cloud_observability_project}/dashboard/${lightstep_dashboard.otel_collector_envoy_dashboard.id}" + description = "OpenTelemetry Collector Envoy Metrics Dashboard URL" +} diff --git a/collector-dashboards/otel-collector-envoy-dashboard/variables.tf 
b/collector-dashboards/otel-collector-envoy-dashboard/variables.tf new file mode 100644 index 0000000..a52e44f --- /dev/null +++ b/collector-dashboards/otel-collector-envoy-dashboard/variables.tf @@ -0,0 +1,4 @@ +variable "cloud_observability_project" { + description = "Name of Cloud Observability project" + type = string +} diff --git a/collector-dashboards/otel-collector-jbosswildfly-prom-receiver-dashboard/main.tf b/collector-dashboards/otel-collector-jbosswildfly-prom-receiver-dashboard/main.tf index eadf058..8f639f5 100644 --- a/collector-dashboards/otel-collector-jbosswildfly-prom-receiver-dashboard/main.tf +++ b/collector-dashboards/otel-collector-jbosswildfly-prom-receiver-dashboard/main.tf @@ -2,7 +2,7 @@ terraform { required_providers { lightstep = { source = "lightstep/lightstep" - version = "~> 1.70.10" + version = "~> 1.79.0" } } required_version = ">= v1.0.11" diff --git a/main.tf b/main.tf index a8f163b..3b645b6 100644 --- a/main.tf +++ b/main.tf @@ -84,6 +84,11 @@ module "cloud_observability_otel_collector_elasticsearchreceiver_dashboard" { cloud_observability_project = var.cloud_observability_project } +module "cloud_observability_otel_collector_envoy_dashboard" { + source = "./collector-dashboards/otel-collector-envoy-dashboard" + cloud_observability_project = var.cloud_observability_project +} + module "cloud_observability_otel_collector_etcd_dashboard" { source = "./collector-dashboards/otel-collector-etcd-dashboard" cloud_observability_project = var.cloud_observability_project @@ -159,6 +164,11 @@ module "cloud_observability_otel_collector_iisreceiver_dashboard" { cloud_observability_project = var.cloud_observability_project } +module "cloud_observability_otel_collector_jbosswildfly_prom_receiver_dashboard" { + source = "./collector-dashboards/otel-collector-jbosswildfly-prom-receiver-dashboard" + cloud_observability_project = var.cloud_observability_project +} + module "cloud_observability_otel_collector_k8s_kubelet_prom_dashboard" { source = 
"./collector-dashboards/otel-collector-k8s-kubelet-prom-dashboard" cloud_observability_project = var.cloud_observability_project diff --git a/outputs.tf b/outputs.tf index 38eba05..c69bb64 100644 --- a/outputs.tf +++ b/outputs.tf @@ -69,6 +69,11 @@ output "cloud_observability_otel_collector_elasticsearchreceiver_dashboard_url" description = "Cloud Observability OpenTelemetry OpenTelemetry Elasticsearchreceiver Receiver Dashboard" } +output "cloud_observability_otel_collector_envoy_dashboard_url" { + value = module.cloud_observability_otel_collector_envoy_dashboard.dashboard_url + description = "Cloud Observability OpenTelemetry Envoy - Overview Dashboard" +} + output "cloud_observability_otel_collector_etcd_dashboard_url" { value = module.cloud_observability_otel_collector_etcd_dashboard.dashboard_url description = "Cloud Observability OpenTelemetry ETCD v3 - Overview Dashboard" @@ -144,6 +149,11 @@ output "cloud_observability_otel_collector_iisreceiver_dashboard_url" { description = "Cloud Observability OpenTelemetry OpenTelemetry iisreceiver Integration Dashboard" } +output "cloud_observability_otel_collector_jbosswildfly_prom_receiver_dashboard_url" { + value = module.cloud_observability_otel_collector_jbosswildfly_prom_receiver_dashboard.dashboard_url + description = "Cloud Observability OpenTelemetry OpenTelemetry JBoss Wildfly Dashboard Dashboard" +} + output "cloud_observability_otel_collector_k8s_kubelet_prom_dashboard_url" { value = module.cloud_observability_otel_collector_k8s_kubelet_prom_dashboard.dashboard_url description = "Cloud Observability OpenTelemetry K8S Kubelet Dashboard" @@ -175,12 +185,12 @@ output "cloud_observability_otel_collector_kubeletstatsreceiver_dashboard_url" { } output "cloud_observability_otel_collector_kubernetes_comprehensive_dashboard_url" { - value = module.cloud_observability_otel_collector_kubernetes_comprehensive_dashboard.dashboard_url + value = 
module.cloud_observability_otel_collector_kubernetes_comprehensive_dashboard.cluster_dashboard_url description = "Cloud Observability OpenTelemetry OpenTelemetry Collector Comprehensive Kubernetes Dashboard Dashboard" } output "cloud_observability_otel_collector_kubernetes_dashboard_url" { - value = module.cloud_observability_otel_collector_kubernetes_dashboard.dashboard_url + value = module.cloud_observability_otel_collector_kubernetes_dashboard.cluster_dashboard_url description = "Cloud Observability OpenTelemetry Dashboard" }