From 56546edfad7bfd53e99a768f4b92ec4778a53f72 Mon Sep 17 00:00:00 2001 From: Dinesh Gurumurthy Date: Sat, 7 Feb 2026 20:05:29 +0000 Subject: [PATCH] [manual-container-metrics] Update dependencies, align with OTel/DD best practices - Update Go OTel SDK from v1.18.0 to v1.40.0 (latest stable) - Update otelhttp from v0.44.0 to v0.65.0 - Update Go version from 1.20 to 1.24 - Change metric instruments from UpDownCounter to Gauge (correct OTel instrument for point-in-time container resource measurements) - Add metric descriptions and units per OTel semantic conventions - Add proper error handling throughout (return errors instead of swallowing them) - Add graceful shutdown with signal handling (SIGINT/SIGTERM) - Add HTTP server timeouts (read/write/idle) - Add /health endpoint for basic health checks - Add startup probe in Kubernetes manifest - Use multi-stage Docker build with distroless base image - Add pod and container security context (non-root, read-only FS, drop all capabilities) - Right-size resource requests/limits (100m/128Mi to 500m/256Mi) - Add comments documenting Datadog metric name mapping - Add OTel resource creation with semconv service.name - Remove unused go.uber.org/zap dependency - Document environment variables, endpoints, and metrics in README Co-Authored-By: Claude Opus 4.6 --- apps/manual-container-metrics/Dockerfile | 9 +- apps/manual-container-metrics/README.md | 59 ++++- apps/manual-container-metrics/app/go.mod | 53 ++-- apps/manual-container-metrics/app/go.sum | 129 +++++----- apps/manual-container-metrics/app/main.go | 284 +++++++++++++++++----- apps/manual-container-metrics/values.yaml | 63 ++++- 6 files changed, 419 insertions(+), 178 deletions(-) diff --git a/apps/manual-container-metrics/Dockerfile b/apps/manual-container-metrics/Dockerfile index f04f190..4bc7f4c 100644 --- a/apps/manual-container-metrics/Dockerfile +++ b/apps/manual-container-metrics/Dockerfile @@ -1,14 +1,19 @@ # Unless explicitly stated otherwise all files in this repository are licensed # under the Apache 2.0 License. # -FROM golang:1.20 +FROM golang:1.24 AS builder WORKDIR /app COPY app manual-container-metrics-app/ WORKDIR /app/manual-container-metrics-app RUN go mod download +RUN CGO_ENABLED=0 go build -o /manual-container-metrics-app -RUN go build -o /manual-container-metrics-app +FROM gcr.io/distroless/static-debian12:nonroot + +COPY --from=builder /manual-container-metrics-app /manual-container-metrics-app + +EXPOSE 3000 CMD [ "/manual-container-metrics-app" ] diff --git a/apps/manual-container-metrics/README.md b/apps/manual-container-metrics/README.md index c35dc2f..58fa4d9 100644 --- a/apps/manual-container-metrics/README.md +++ b/apps/manual-container-metrics/README.md @@ -1,13 +1,60 @@ # Manual Container Metrics Application -This project consists of a Go server instrumented with OpenTelemetry with the OpenTelemetry Collector. -The server manually creates metrics with the same names as container metrics from the docker runtime. These metrics are assigned the same `container.id` and `container.name` as the server to demonstrate that trace container metrics correlation works in the trace app. Traces will be automatically generated from the Kubernetes liveness and readiness requests. +This project consists of a Go server instrumented with OpenTelemetry, exporting via the OTLP protocol to the OTel Collector (or Datadog Agent with OTLP ingestion enabled). + +The server manually creates metrics with the same names as container metrics from the Docker/containerd runtime. These metrics are assigned the same `container.id` and `container.name` as the server container to demonstrate that trace-to-container-metrics correlation works in the Datadog APM trace view. Traces are automatically generated from Kubernetes liveness and readiness probe requests. + +## Container Metrics + +The following container metrics are emitted as OTel Gauge instruments: + +| Metric Name | Unit | Description | +|---|---|---| +| `container.cpu.usage` | ns | Total CPU usage in nanoseconds | +| `container.cpu.limit` | {cpus} | CPU limit assigned to the container | +| `container.cpu.user` | ns | User CPU time in nanoseconds | +| `container.cpu.system` | ns | System CPU time in nanoseconds | +| `container.memory.rss` | By | Resident set size memory in bytes | +| `container.memory.usage` | By | Total memory usage in bytes | +| `container.memory.limit` | By | Memory limit in bytes | +| `container.io.read` | By | Bytes read from disk | +| `container.io.write` | By | Bytes written to disk | +| `container.net.sent` | By | Bytes sent over network | +| `container.net.rcvd` | By | Bytes received over network | + +## Environment Variables + +| Variable | Description | Required | +|---|---|---| +| `OTEL_SERVICE_NAME` | Service name for OTel resource | Yes | +| `OTEL_CONTAINER_NAME` | Container name for metric correlation | Yes | +| `OTEL_K8S_CONTAINER_ID` | Container/pod ID for metric correlation | Yes | +| `OTEL_EXPORTER_OTLP_ENDPOINT` | OTLP collector endpoint (e.g., `http://host:4317`) | Yes | +| `OTEL_EXPORTER_OTLP_PROTOCOL` | OTLP protocol (`grpc`) | Yes | +| `OTEL_RESOURCE_ATTRIBUTES` | Additional OTel resource attributes | No | + +## Endpoints + +| Path | Description | +|---|---| +| `/health` | Health check (returns `{"status":"healthy"}`) | +| `/readiness` | Readiness probe with trace correlation | +| `/liveness` | Liveness probe with trace correlation | ## Docker Build -This application can be built with the following command: -``` + +Build the application image: + +```bash docker build -t . --platform linux/amd64 ``` -## Deploying -After building the Docker image, the tag can be pushed and added into `values.yaml` to be deployed with Kubernetes. +## Deploying to Kubernetes + +After building and pushing the Docker image, update the image tag in `values.yaml` and apply: + +```bash +kubectl apply -f values.yaml +``` + +The deployment expects an OTel Collector or Datadog Agent running on each node with OTLP ingestion enabled on port 4317. diff --git a/apps/manual-container-metrics/app/go.mod b/apps/manual-container-metrics/app/go.mod index 1d49403..1393d02 100644 --- a/apps/manual-container-metrics/app/go.mod +++ b/apps/manual-container-metrics/app/go.mod @@ -1,36 +1,35 @@ module manual-container-metrics-app -go 1.20 +go 1.24.0 require ( - github.com/sirupsen/logrus v1.9.3 - go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.44.0 - go.opentelemetry.io/otel v1.18.0 - go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v0.41.0 - go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.18.0 - go.opentelemetry.io/otel/metric v1.18.0 - go.opentelemetry.io/otel/sdk v1.18.0 - go.opentelemetry.io/otel/sdk/metric v0.41.0 - go.opentelemetry.io/otel/trace v1.18.0 - go.uber.org/zap v1.26.0 + github.com/sirupsen/logrus v1.9.4 + go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.65.0 + go.opentelemetry.io/otel v1.40.0 + go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.40.0 + go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.40.0 + go.opentelemetry.io/otel/metric v1.40.0 + go.opentelemetry.io/otel/sdk v1.40.0 + go.opentelemetry.io/otel/sdk/metric v1.40.0 + go.opentelemetry.io/otel/trace v1.40.0 ) require ( - github.com/cenkalti/backoff/v4 v4.2.1 // indirect - github.com/felixge/httpsnoop v1.0.3 // indirect - github.com/go-logr/logr v1.2.4 // indirect + github.com/cenkalti/backoff/v5 v5.0.3 // indirect + github.com/cespare/xxhash/v2 v2.3.0 // indirect + github.com/felixge/httpsnoop v1.0.4 // indirect + github.com/go-logr/logr v1.4.3 // indirect github.com/go-logr/stdr v1.2.2 // indirect - github.com/golang/protobuf v1.5.3 // indirect - github.com/grpc-ecosystem/grpc-gateway/v2 v2.16.0 // indirect - go.opentelemetry.io/otel/exporters/otlp/otlpmetric v0.41.0 // indirect - go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.18.0 // indirect - go.opentelemetry.io/proto/otlp v1.0.0 // indirect - go.uber.org/multierr v1.10.0 // indirect - golang.org/x/net v0.17.0 // indirect - golang.org/x/sys v0.13.0 // indirect - golang.org/x/text v0.13.0 // indirect - google.golang.org/genproto/googleapis/api v0.0.0-20230711160842-782d3b101e98 // indirect - google.golang.org/genproto/googleapis/rpc v0.0.0-20230711160842-782d3b101e98 // indirect - google.golang.org/grpc v1.58.3 // indirect - google.golang.org/protobuf v1.31.0 // indirect + github.com/google/uuid v1.6.0 // indirect + github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.7 // indirect + go.opentelemetry.io/auto/sdk v1.2.1 // indirect + go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.40.0 // indirect + go.opentelemetry.io/proto/otlp v1.9.0 // indirect + golang.org/x/net v0.49.0 // indirect + golang.org/x/sys v0.40.0 // indirect + golang.org/x/text v0.33.0 // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20260203192932-546029d2fa20 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20260203192932-546029d2fa20 // indirect + google.golang.org/grpc v1.78.0 // indirect + google.golang.org/protobuf v1.36.11 // indirect ) diff --git a/apps/manual-container-metrics/app/go.sum b/apps/manual-container-metrics/app/go.sum index bbfa5a6..016cd98 100644 --- a/apps/manual-container-metrics/app/go.sum +++ b/apps/manual-container-metrics/app/go.sum @@ -1,76 +1,69 @@ -github.com/cenkalti/backoff/v4 v4.2.1 h1:y4OZtCnogmCPw98Zjyt5a6+QwPLGkiQsYW5oUqylYbM= -github.com/cenkalti/backoff/v4 v4.2.1/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE= -github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/cenkalti/backoff/v5 v5.0.3 h1:ZN+IMa753KfX5hd8vVaMixjnqRZ3y8CuJKRKj1xcsSM= +github.com/cenkalti/backoff/v5 v5.0.3/go.mod h1:rkhZdG3JZukswDf7f0cwqPNk4K0sa+F97BxZthm/crw= +github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= +github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/felixge/httpsnoop v1.0.3 h1:s/nj+GCswXYzN5v2DpNMuMQYe+0DDwt5WVCU6CWBdXk= -github.com/felixge/httpsnoop v1.0.3/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= +github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= +github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= -github.com/go-logr/logr v1.2.4 h1:g01GSCwiDw2xSZfjJ2/T9M+S6pFdcNtFYsp+Y43HYDQ= -github.com/go-logr/logr v1.2.4/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= +github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= +github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= -github.com/golang/glog v1.1.0 h1:/d3pCKDPWNnvIWe0vVUpNP32qc8U3PDVxySP/y360qE= -github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= -github.com/golang/protobuf v1.5.3 h1:KhyjKVUg7Usr/dYsdSqoFveMYd5ko72D+zANwlG1mmg= -github.com/golang/protobuf v1.5.3/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= -github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= -github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38= -github.com/grpc-ecosystem/grpc-gateway/v2 v2.16.0 h1:YBftPWNWd4WwGqtY2yeZL2ef8rHAxPBD8KFhJpmcqms= -github.com/grpc-ecosystem/grpc-gateway/v2 v2.16.0/go.mod h1:YN5jB8ie0yfIUg6VvR9Kz84aCaG7AsGZnLjhHbUqwPg= +github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= +github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= +github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= +github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.7 h1:X+2YciYSxvMQK0UZ7sg45ZVabVZBeBuvMkmuI2V3Fak= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.7/go.mod h1:lW34nIZuQ8UDPdkon5fmfp2l3+ZkQ2me/+oecHYLOII= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= -github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= -github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= -github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= -github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= -go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.44.0 h1:KfYpVmrjI7JuToy5k8XV3nkapjWx48k4E4JOtVstzQI= -go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.44.0/go.mod h1:SeQhzAEccGVZVEy7aH87Nh0km+utSpo1pTv6eMMop48= -go.opentelemetry.io/otel v1.18.0 h1:TgVozPGZ01nHyDZxK5WGPFB9QexeTMXEH7+tIClWfzs= -go.opentelemetry.io/otel v1.18.0/go.mod h1:9lWqYO0Db579XzVuCKFNPDl4s73Voa+zEck3wHaAYQI= -go.opentelemetry.io/otel/exporters/otlp/otlpmetric v0.41.0 h1:k0k7hFNDd8K4iOMJXj7s8sHaC4mhTlAeppRmZXLgZ6k= -go.opentelemetry.io/otel/exporters/otlp/otlpmetric v0.41.0/go.mod h1:hG4Fj/y8TR/tlEDREo8tWstl9fO9gcFkn4xrx0Io8xU= -go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v0.41.0 h1:HgbDTD8pioFdY3NRc/YCvsWjqQPtweGyXxa32LgnTOw= -go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v0.41.0/go.mod h1:tmvt/yK5Es5d6lHYWerLSOna8lCEfrBVX/a9M0ggqss= -go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.18.0 h1:IAtl+7gua134xcV3NieDhJHjjOVeJhXAnYf/0hswjUY= -go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.18.0/go.mod h1:w+pXobnBzh95MNIkeIuAKcHe/Uu/CX2PKIvBP6ipKRA= -go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.18.0 h1:yE32ay7mJG2leczfREEhoW3VfSZIvHaB+gvVo1o8DQ8= -go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.18.0/go.mod h1:G17FHPDLt74bCI7tJ4CMitEk4BXTYG4FW6XUpkPBXa4= -go.opentelemetry.io/otel/metric v1.18.0 h1:JwVzw94UYmbx3ej++CwLUQZxEODDj/pOuTCvzhtRrSQ= -go.opentelemetry.io/otel/metric v1.18.0/go.mod h1:nNSpsVDjWGfb7chbRLUNW+PBNdcSTHD4Uu5pfFMOI0k= -go.opentelemetry.io/otel/sdk v1.18.0 h1:e3bAB0wB3MljH38sHzpV/qWrOTCFrdZF2ct9F8rBkcY= -go.opentelemetry.io/otel/sdk v1.18.0/go.mod h1:1RCygWV7plY2KmdskZEDDBs4tJeHG92MdHZIluiYs/M= -go.opentelemetry.io/otel/sdk/metric v0.41.0 h1:c3sAt9/pQ5fSIUfl0gPtClV3HhE18DCVzByD33R/zsk= -go.opentelemetry.io/otel/sdk/metric v0.41.0/go.mod h1:PmOmSt+iOklKtIg5O4Vz9H/ttcRFSNTgii+E1KGyn1w= -go.opentelemetry.io/otel/trace v1.18.0 h1:NY+czwbHbmndxojTEKiSMHkG2ClNH2PwmcHrdo0JY10= -go.opentelemetry.io/otel/trace v1.18.0/go.mod h1:T2+SGJGuYZY3bjj5rgh/hN7KIrlpWC5nS8Mjvzckz+0= -go.opentelemetry.io/proto/otlp v1.0.0 h1:T0TX0tmXU8a3CbNXzEKGeU5mIVOdf0oykP+u2lIVU/I= -go.opentelemetry.io/proto/otlp v1.0.0/go.mod h1:Sy6pihPLfYHkr3NkUbEhGHFhINUSI/v80hjKIs5JXpM= -go.uber.org/goleak v1.2.1 h1:NBol2c7O1ZokfZ0LEU9K6Whx/KnwvepVetCUhtKja4A= -go.uber.org/multierr v1.10.0 h1:S0h4aNzvfcFsC3dRF1jLoaov7oRaKqRGC/pUEJ2yvPQ= -go.uber.org/multierr v1.10.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y= -go.uber.org/zap v1.26.0 h1:sI7k6L95XOKS281NhVKOFCUNIvv9e0w4BF8N3u+tCRo= -go.uber.org/zap v1.26.0/go.mod h1:dtElttAiwGvoJ/vj4IwHBS/gXsEu/pZ50mUIRWuG0so= -golang.org/x/net v0.17.0 h1:pVaXccu2ozPjCXewfr1S7xza/zcXTity9cCdXQYSjIM= -golang.org/x/net v0.17.0/go.mod h1:NxSsAGuq816PNPmqtQdLE42eU2Fs7NoRIZrHJAlaCOE= -golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.13.0 h1:Af8nKPmuFypiUBjVoU9V20FiaFXOcuZI21p0ycVYYGE= -golang.org/x/sys v0.13.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/text v0.13.0 h1:ablQoSUd0tRdKxZewP80B+BaqeKJuVhuRxj/dkrun3k= -golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= -golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -google.golang.org/genproto v0.0.0-20230711160842-782d3b101e98 h1:Z0hjGZePRE0ZBWotvtrwxFNrNE9CUAGtplaDK5NNI/g= -google.golang.org/genproto/googleapis/api v0.0.0-20230711160842-782d3b101e98 h1:FmF5cCW94Ij59cfpoLiwTgodWmm60eEV0CjlsVg2fuw= -google.golang.org/genproto/googleapis/api v0.0.0-20230711160842-782d3b101e98/go.mod h1:rsr7RhLuwsDKL7RmgDDCUc6yaGr1iqceVb5Wv6f6YvQ= -google.golang.org/genproto/googleapis/rpc v0.0.0-20230711160842-782d3b101e98 h1:bVf09lpb+OJbByTj913DRJioFFAjf/ZGxEz7MajTp2U= -google.golang.org/genproto/googleapis/rpc v0.0.0-20230711160842-782d3b101e98/go.mod h1:TUfxEVdsvPg18p6AslUXFoLdpED4oBnGwyqk3dV1XzM= -google.golang.org/grpc v1.58.3 h1:BjnpXut1btbtgN/6sp+brB2Kbm2LjNXnidYujAVbSoQ= -google.golang.org/grpc v1.58.3/go.mod h1:tgX3ZQDlNJGU96V6yHh1T/JeoBQ2TXdr43YbYSsCJk0= -google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= -google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= -google.golang.org/protobuf v1.31.0 h1:g0LDEJHgrBl9N9r17Ru3sqWhkIx2NB67okBHPwC7hs8= -google.golang.org/protobuf v1.31.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= -gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +github.com/sirupsen/logrus v1.9.4 h1:TsZE7l11zFCLZnZ+teH4Umoq5BhEIfIzfRDZ1Uzql2w= +github.com/sirupsen/logrus v1.9.4/go.mod h1:ftWc9WdOfJ0a92nsE2jF5u5ZwH8Bv2zdeOC42RjbV2g= +github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= +github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= +go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ64= +go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.65.0 h1:7iP2uCb7sGddAr30RRS6xjKy7AZ2JtTOPA3oolgVSw8= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.65.0/go.mod h1:c7hN3ddxs/z6q9xwvfLPk+UHlWRQyaeR1LdgfL/66l0= +go.opentelemetry.io/otel v1.40.0 h1:oA5YeOcpRTXq6NN7frwmwFR0Cn3RhTVZvXsP4duvCms= +go.opentelemetry.io/otel v1.40.0/go.mod h1:IMb+uXZUKkMXdPddhwAHm6UfOwJyh4ct1ybIlV14J0g= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.40.0 h1:NOyNnS19BF2SUDApbOKbDtWZ0IK7b8FJ2uAGdIWOGb0= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.40.0/go.mod h1:VL6EgVikRLcJa9ftukrHu/ZkkhFBSo1lzvdBC9CF1ss= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.40.0 h1:QKdN8ly8zEMrByybbQgv8cWBcdAarwmIPZ6FThrWXJs= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.40.0/go.mod h1:bTdK1nhqF76qiPoCCdyFIV+N/sRHYXYCTQc+3VCi3MI= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.40.0 h1:DvJDOPmSWQHWywQS6lKL+pb8s3gBLOZUtw4N+mavW1I= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.40.0/go.mod h1:EtekO9DEJb4/jRyN4v4Qjc2yA7AtfCBuz2FynRUWTXs= +go.opentelemetry.io/otel/metric v1.40.0 h1:rcZe317KPftE2rstWIBitCdVp89A2HqjkxR3c11+p9g= +go.opentelemetry.io/otel/metric v1.40.0/go.mod h1:ib/crwQH7N3r5kfiBZQbwrTge743UDc7DTFVZrrXnqc= +go.opentelemetry.io/otel/sdk v1.40.0 h1:KHW/jUzgo6wsPh9At46+h4upjtccTmuZCFAc9OJ71f8= +go.opentelemetry.io/otel/sdk v1.40.0/go.mod h1:Ph7EFdYvxq72Y8Li9q8KebuYUr2KoeyHx0DRMKrYBUE= +go.opentelemetry.io/otel/sdk/metric v1.40.0 h1:mtmdVqgQkeRxHgRv4qhyJduP3fYJRMX4AtAlbuWdCYw= +go.opentelemetry.io/otel/sdk/metric v1.40.0/go.mod h1:4Z2bGMf0KSK3uRjlczMOeMhKU2rhUqdWNoKcYrtcBPg= +go.opentelemetry.io/otel/trace v1.40.0 h1:WA4etStDttCSYuhwvEa8OP8I5EWu24lkOzp+ZYblVjw= +go.opentelemetry.io/otel/trace v1.40.0/go.mod h1:zeAhriXecNGP/s2SEG3+Y8X9ujcJOTqQ5RgdEJcawiA= +go.opentelemetry.io/proto/otlp v1.9.0 h1:l706jCMITVouPOqEnii2fIAuO3IVGBRPV5ICjceRb/A= +go.opentelemetry.io/proto/otlp v1.9.0/go.mod h1:xE+Cx5E/eEHw+ISFkwPLwCZefwVjY+pqKg1qcK03+/4= +go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= +go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= +golang.org/x/net v0.49.0 h1:eeHFmOGUTtaaPSGNmjBKpbng9MulQsJURQUAfUwY++o= +golang.org/x/net v0.49.0/go.mod h1:/ysNB2EvaqvesRkuLAyjI1ycPZlQHM3q01F02UY/MV8= +golang.org/x/sys v0.40.0 h1:DBZZqJ2Rkml6QMQsZywtnjnnGvHza6BTfYFWY9kjEWQ= +golang.org/x/sys v0.40.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= +golang.org/x/text v0.33.0 h1:B3njUFyqtHDUI5jMn1YIr5B0IE2U0qck04r6d4KPAxE= +golang.org/x/text v0.33.0/go.mod h1:LuMebE6+rBincTi9+xWTY8TztLzKHc/9C1uBCG27+q8= +gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk= +gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E= +google.golang.org/genproto/googleapis/api v0.0.0-20260203192932-546029d2fa20 h1:7ei4lp52gK1uSejlA8AZl5AJjeLUOHBQscRQZUgAcu0= +google.golang.org/genproto/googleapis/api v0.0.0-20260203192932-546029d2fa20/go.mod h1:ZdbssH/1SOVnjnDlXzxDHK2MCidiqXtbYccJNzNYPEE= +google.golang.org/genproto/googleapis/rpc v0.0.0-20260203192932-546029d2fa20 h1:Jr5R2J6F6qWyzINc+4AM8t5pfUz6beZpHp678GNrMbE= +google.golang.org/genproto/googleapis/rpc v0.0.0-20260203192932-546029d2fa20/go.mod h1:j9x/tPzZkyxcgEFkiKEEGxfvyumM01BEtsW8xzOahRQ= +google.golang.org/grpc v1.78.0 h1:K1XZG/yGDJnzMdd/uZHAkVqJE+xIDOcmdSFZkBUicNc= +google.golang.org/grpc v1.78.0/go.mod h1:I47qjTo4OKbMkjA/aOOwxDIiPSBofUtQUI5EfpWvW7U= +google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE= +google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/apps/manual-container-metrics/app/main.go b/apps/manual-container-metrics/app/main.go index c6d4e99..1b98186 100644 --- a/apps/manual-container-metrics/app/main.go +++ b/apps/manual-container-metrics/app/main.go @@ -3,10 +3,13 @@ package main import ( "context" "encoding/json" - "go.uber.org/zap" - "io" + "errors" + "fmt" "net/http" "os" + "os/signal" + "syscall" + "time" log "github.com/sirupsen/logrus" @@ -15,146 +18,301 @@ import ( "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc" "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc" - metric2 "go.opentelemetry.io/otel/metric" + otelmetric "go.opentelemetry.io/otel/metric" "go.opentelemetry.io/otel/propagation" "go.opentelemetry.io/otel/sdk/metric" "go.opentelemetry.io/otel/sdk/resource" sdktrace "go.opentelemetry.io/otel/sdk/trace" + semconv "go.opentelemetry.io/otel/semconv/v1.26.0" "go.opentelemetry.io/otel/trace" ) +const ( + // metricCollectionInterval is the interval at which metrics are collected and exported. + // 10 seconds aligns with the default Datadog Agent metric collection interval. + metricCollectionInterval = 10 * time.Second + + // serverPort is the port the HTTP server listens on. + serverPort = ":3000" + + // shutdownTimeout is the maximum time to wait for graceful shutdown. + shutdownTimeout = 5 * time.Second +) + func initLogger() { log.SetFormatter(&log.JSONFormatter{}) log.SetOutput(os.Stdout) log.SetLevel(log.InfoLevel) } -func initMeter(ctx context.Context, r *resource.Resource) *metric.MeterProvider { +// initResource creates an OTel resource with service information and environment-provided attributes. +// Resource attributes are used by the Datadog exporter for tagging and container correlation. +func initResource(ctx context.Context) (*resource.Resource, error) { + serviceName := getEnvOrDefault("OTEL_SERVICE_NAME", "manual-container-metrics-app") + return resource.New(ctx, + resource.WithFromEnv(), + resource.WithTelemetrySDK(), + resource.WithHost(), + resource.WithAttributes( + semconv.ServiceNameKey.String(serviceName), + ), + ) +} + +// initMeterProvider creates and configures the OTel MeterProvider with OTLP gRPC export. +// The periodic reader interval is set to align with Datadog Agent collection intervals. +func initMeterProvider(ctx context.Context, res *resource.Resource) (*metric.MeterProvider, error) { exporter, err := otlpmetricgrpc.New(ctx, otlpmetricgrpc.WithInsecure()) if err != nil { - log.Fatal("new otlp metric grpc exporter failed: %v", zap.Error(err)) + return nil, fmt.Errorf("failed to create OTLP metric exporter: %w", err) } - provider := metric.NewMeterProvider(metric.WithReader(metric.NewPeriodicReader(exporter)), metric.WithResource(r)) - return provider + provider := metric.NewMeterProvider( + metric.WithReader(metric.NewPeriodicReader(exporter, + metric.WithInterval(metricCollectionInterval), + )), + metric.WithResource(res), + ) + return provider, nil } -func initTracerProvider(ctx context.Context, r *resource.Resource) *sdktrace.TracerProvider { - // Create exporter. +// initTracerProvider creates and configures the OTel TracerProvider with OTLP gRPC export. +func initTracerProvider(ctx context.Context, res *resource.Resource) (*sdktrace.TracerProvider, error) { exporter, err := otlptracegrpc.New(ctx, otlptracegrpc.WithInsecure()) if err != nil { - log.Fatalf("failed to construct new exporter: ", err) + return nil, fmt.Errorf("failed to create OTLP trace exporter: %w", err) } - // Create tracer provider. tp := sdktrace.NewTracerProvider( sdktrace.WithBatcher(exporter), - sdktrace.WithResource(r), + sdktrace.WithResource(res), ) - // Set tracer provider and propagator. otel.SetTracerProvider(tp) - otel.SetTextMapPropagator(propagation.NewCompositeTextMapPropagator(propagation.TraceContext{}, propagation.Baggage{})) + otel.SetTextMapPropagator(propagation.NewCompositeTextMapPropagator( + propagation.TraceContext{}, + propagation.Baggage{}, + )) - return tp + return tp, nil +} + +// registerContainerMetrics creates and records container metrics that correlate with +// Datadog container monitoring. These metric names map to Datadog's container metric +// namespace so that trace-to-container-metrics correlation works in the Datadog UI. +// +// Datadog metric mapping (OTel -> DD): +// - container.cpu.usage (Gauge) -> container.cpu.usage +// - container.cpu.limit (Gauge) -> container.cpu.limit +// - container.cpu.user (Gauge) -> container.cpu.user +// - container.cpu.system (Gauge) -> container.cpu.system +// - container.memory.rss (Gauge) -> container.memory.rss +// - container.memory.usage (Gauge) -> container.memory.usage +// - container.memory.limit (Gauge) -> container.memory.limit +// - container.io.read (Gauge) -> container.io.read +// - container.io.write (Gauge) -> container.io.write +// - container.net.sent (Gauge) -> container.net.sent +// - container.net.rcvd (Gauge) -> container.net.rcvd +// +// Note: These metrics use Float64Gauge because they represent point-in-time measurements +// of container resource usage, not cumulative counters. In the OTel spec, Gauge is the +// correct instrument for non-additive values that represent current state. +// UpDownCounter was previously used but is semantically incorrect for these measurements +// because container resource values are absolute readings, not deltas. +// +// The container.name and container.id attributes are required for Datadog to correlate +// these metrics with the correct container in the trace view. +func registerContainerMetrics(ctx context.Context, meter otelmetric.Meter) error { + containerName := getEnvOrDefault("OTEL_CONTAINER_NAME", "manual-container-metrics-app") + containerID := os.Getenv("OTEL_K8S_CONTAINER_ID") + if containerID == "" { + log.Warn("OTEL_K8S_CONTAINER_ID is not set; container metrics correlation may not work") + } + + // Attributes required for Datadog container metrics correlation. + // container.name and container.id must match the actual container for DD to correlate. + attrs := otelmetric.WithAttributes( + attribute.String("container.name", containerName), + attribute.String("container.id", containerID), + ) + + // CPU metrics -- these are point-in-time gauge values representing current CPU state. + // DD maps these to container.cpu.* metrics in the container monitoring view. + type metricDef struct { + name string + unit string + desc string + } + + gaugeMetrics := []metricDef{ + {"container.cpu.usage", "ns", "Total CPU usage of the container in nanoseconds"}, + {"container.cpu.limit", "{cpus}", "CPU limit assigned to the container"}, + {"container.cpu.user", "ns", "User CPU time consumed by the container in nanoseconds"}, + {"container.cpu.system", "ns", "System CPU time consumed by the container in nanoseconds"}, + {"container.memory.rss", "By", "Resident set size (RSS) memory of the container in bytes"}, + {"container.memory.usage", "By", "Total memory usage of the container in bytes"}, + {"container.memory.limit", "By", "Memory limit of the container in bytes"}, + {"container.io.read", "By", "Bytes read from disk by the container"}, + {"container.io.write", "By", "Bytes written to disk by the container"}, + {"container.net.sent", "By", "Bytes sent over the network by the container"}, + {"container.net.rcvd", "By", "Bytes received over the network by the container"}, + } + + for _, m := range gaugeMetrics { + gauge, err := meter.Float64Gauge(m.name, + otelmetric.WithDescription(m.desc), + otelmetric.WithUnit(m.unit), + ) + if err != nil { + return fmt.Errorf("failed to create gauge %s: %w", m.name, err) + } + // Record an initial value so the metric is registered with the collector. + gauge.Record(ctx, 1, attrs) + } + + return nil } func main() { initLogger() - ctx := context.Background() - // Create resource. - res, err := resource.New(ctx, resource.WithFromEnv()) + ctx, cancel := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM) + defer cancel() + + // Create resource with service identity and environment attributes. + res, err := initResource(ctx) if err != nil { - log.Fatalf("failed to create resource: ", err) + log.Fatalf("Failed to create OTel resource: %v", err) + } + + // Initialize tracer provider. + tp, err := initTracerProvider(ctx, res) + if err != nil { + log.Fatalf("Failed to initialize tracer provider: %v", err) } - tp := initTracerProvider(ctx, res) defer func() { - if err := tp.Shutdown(ctx); err != nil { - log.Error("Error shutting down tracer provider: ", err) + shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), shutdownTimeout) + defer shutdownCancel() + if err := tp.Shutdown(shutdownCtx); err != nil { + log.Errorf("Error shutting down tracer provider: %v", err) } }() - mp := initMeter(ctx, res) + + // Initialize meter provider. + mp, err := initMeterProvider(ctx, res) + if err != nil { + log.Fatalf("Failed to initialize meter provider: %v", err) + } defer func() { - if err := mp.Shutdown(ctx); err != nil { - log.Error("Error shutting down meter provider: ", err) + shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), shutdownTimeout) + defer shutdownCancel() + if err := mp.Shutdown(shutdownCtx); err != nil { + log.Errorf("Error shutting down meter provider: %v", err) } }() - meter := mp.Meter(os.Getenv("OTEL_SERVICE_NAME")) - containerCpuUsage, err := meter.Float64UpDownCounter("container.cpu.usage") - containerCpuLimit, err := meter.Float64UpDownCounter("container.cpu.limit") - containerCpuUser, err := meter.Float64UpDownCounter("container.cpu.user") - containerCpuSystem, err := meter.Float64UpDownCounter("container.cpu.system") - containerMemoryRss, err := meter.Float64UpDownCounter("container.memory.rss") - containerMemoryUsage, err := meter.Float64UpDownCounter("container.memory.usage") - containerMemoryLimit, err := meter.Float64UpDownCounter("container.memory.limit") - containerIoRead, err := meter.Float64UpDownCounter("container.io.read") - containerIoWrite, err := meter.Float64UpDownCounter("container.io.write") - containerNetSent, err := meter.Float64UpDownCounter("container.net.sent") - containerNetRcvd, err := meter.Float64UpDownCounter("container.net.rcvd") - - attr := metric2.WithAttributes(attribute.String("container.name", os.Getenv("OTEL_CONTAINER_NAME")), attribute.String("container.id", os.Getenv("OTEL_K8S_CONTAINER_ID"))) - containerCpuUsage.Add(ctx, 1, attr) - containerCpuLimit.Add(ctx, 1, attr) - containerCpuUser.Add(ctx, 1, attr) - containerCpuSystem.Add(ctx, 1, attr) - containerMemoryRss.Add(ctx, 1, attr) - containerMemoryUsage.Add(ctx, 1, attr) - containerMemoryLimit.Add(ctx, 1, attr) - containerIoRead.Add(ctx, 1, attr) - containerIoWrite.Add(ctx, 1, attr) - containerNetSent.Add(ctx, 1, attr) - containerNetRcvd.Add(ctx, 1, attr) - - // Start HTTP server - mux := SetupHandlers() - err = http.ListenAndServe(":3000", mux) - if err != nil { - log.Fatal(err) + // Create a meter for container metrics. + serviceName := getEnvOrDefault("OTEL_SERVICE_NAME", "manual-container-metrics-app") + meter := mp.Meter(serviceName) + + // Register container metrics for Datadog correlation. + if err := registerContainerMetrics(ctx, meter); err != nil { + log.Fatalf("Failed to register container metrics: %v", err) + } + log.Info("Container metrics registered successfully") + + // Set up HTTP server with OTel-instrumented handlers. + mux := setupHandlers() + server := &http.Server{ + Addr: serverPort, + Handler: mux, + ReadTimeout: 10 * time.Second, + WriteTimeout: 10 * time.Second, + IdleTimeout: 60 * time.Second, + } + + // Start server in a goroutine so we can handle graceful shutdown. + errCh := make(chan error, 1) + go func() { + log.Infof("Starting server on %s", serverPort) + if err := server.ListenAndServe(); err != nil && !errors.Is(err, http.ErrServerClosed) { + errCh <- err + } + close(errCh) + }() + + // Wait for shutdown signal or server error. + select { + case err := <-errCh: + log.Fatalf("Server error: %v", err) + case <-ctx.Done(): + log.Info("Received shutdown signal, shutting down gracefully...") + shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), shutdownTimeout) + defer shutdownCancel() + if err := server.Shutdown(shutdownCtx); err != nil { + log.Errorf("Error shutting down server: %v", err) + } } } -func SetupHandlers() *http.ServeMux { +// setupHandlers registers HTTP handlers with OTel instrumentation. +func setupHandlers() *http.ServeMux { mux := http.NewServeMux() + mux.Handle("/health", otelhttp.NewHandler(http.HandlerFunc(HealthHandler), "HealthHandler")) mux.Handle("/readiness", otelhttp.NewHandler(http.HandlerFunc(ReadinessHandler), "ReadinessHandler")) mux.Handle("/liveness", otelhttp.NewHandler(http.HandlerFunc(LivenessHandler), "LivenessHandler")) return mux } +// HealthHandler provides a basic health check endpoint for monitoring. +func HealthHandler(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + json.NewEncoder(w).Encode(map[string]string{"status": "healthy"}) +} + +// ReadinessHandler indicates whether the application is ready to receive traffic. func ReadinessHandler(w http.ResponseWriter, r *http.Request) { span := trace.SpanFromContext(r.Context()) log.WithFields(log.Fields{ "dd.trace_id": span.SpanContext().TraceID().String(), "dd.span_id": span.SpanContext().SpanID().String(), "service": os.Getenv("OTEL_SERVICE_NAME"), - }).Info("Work is being done in readiness handler") - io.WriteString(w, "Log has been injected with trace_id and span_id!\n") + }).Info("Readiness check") + w.Header().Set("Content-Type", "application/json") resp := struct { Readiness bool `json:"readiness"` }{ Readiness: true, } - json.NewEncoder(w).Encode(resp) } +// LivenessHandler indicates whether the application is running. func LivenessHandler(w http.ResponseWriter, r *http.Request) { span := trace.SpanFromContext(r.Context()) log.WithFields(log.Fields{ "dd.trace_id": span.SpanContext().TraceID().String(), "dd.span_id": span.SpanContext().SpanID().String(), "service": os.Getenv("OTEL_SERVICE_NAME"), - }).Info("Work is being done in liveness handler") - io.WriteString(w, "Log has been injected with trace_id and span_id!\n") + }).Info("Liveness check") + w.Header().Set("Content-Type", "application/json") resp := struct { Liveness bool `json:"liveness"` }{ Liveness: true, } - json.NewEncoder(w).Encode(resp) } + +// getEnvOrDefault returns the value of the environment variable named by key, +// or the provided default value if the variable is not set. +func getEnvOrDefault(key, defaultVal string) string { + if val := os.Getenv(key); val != "" { + return val + } + return defaultVal +} diff --git a/apps/manual-container-metrics/values.yaml b/apps/manual-container-metrics/values.yaml index 073635c..d7dbf32 100644 --- a/apps/manual-container-metrics/values.yaml +++ b/apps/manual-container-metrics/values.yaml @@ -3,10 +3,10 @@ kind: Service metadata: name: manual-container-metrics-app labels: - helm.sh/chart: manual-container-metrics-app-0.1.0 + helm.sh/chart: manual-container-metrics-app-0.2.0 app.kubernetes.io/name: manual-container-metrics-app app.kubernetes.io/instance: manual-container-metrics-app - app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/version: "2.0.0" app.kubernetes.io/managed-by: Helm spec: type: ClusterIP @@ -24,10 +24,10 @@ kind: Deployment metadata: name: manual-container-metrics-app labels: - helm.sh/chart: manual-container-metrics-app-0.1.0 + helm.sh/chart: manual-container-metrics-app-0.2.0 app.kubernetes.io/name: manual-container-metrics-app app.kubernetes.io/instance: manual-container-metrics-app - app.kubernetes.io/version: "1.16.0" + app.kubernetes.io/version: "2.0.0" app.kubernetes.io/managed-by: Helm spec: replicas: 1 @@ -41,35 +41,65 @@ spec: app.kubernetes.io/name: manual-container-metrics-app app.kubernetes.io/instance: manual-container-metrics-app spec: + # Graceful shutdown: allow time for OTel providers to flush. + terminationGracePeriodSeconds: 30 securityContext: - {} + runAsNonRoot: true + runAsUser: 65534 + fsGroup: 65534 containers: - name: manual-container-metrics-app securityContext: - {} - image: "datadog/opentelemetry-examples:manual-container-metrics-app-v1.0.5" + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: + drop: + - ALL + image: "datadog/opentelemetry-examples:manual-container-metrics-app-v2.0.0" imagePullPolicy: IfNotPresent ports: - name: http containerPort: 3000 protocol: TCP + # Health check endpoint for Kubernetes probes. livenessProbe: httpGet: path: /liveness port: http + initialDelaySeconds: 5 + periodSeconds: 15 + timeoutSeconds: 3 + failureThreshold: 3 readinessProbe: httpGet: path: /readiness port: http + initialDelaySeconds: 3 + periodSeconds: 10 + timeoutSeconds: 3 + failureThreshold: 3 + startupProbe: + httpGet: + path: /health + port: http + initialDelaySeconds: 2 + periodSeconds: 5 + failureThreshold: 10 env: + # -- Service identity -- - name: OTEL_SERVICE_NAME value: manual-container-metrics-app + # -- Container identity for Datadog metrics correlation -- + # OTEL_CONTAINER_NAME must match the Kubernetes container name for DD correlation. - name: OTEL_CONTAINER_NAME value: manual-container-metrics-app + # OTEL_K8S_CONTAINER_ID is set to the pod UID; Datadog uses this to correlate + # container metrics with traces in the APM container metrics view. - name: OTEL_K8S_CONTAINER_ID valueFrom: fieldRef: fieldPath: metadata.uid + # -- Kubernetes metadata for resource attributes -- - name: OTEL_K8S_NAMESPACE valueFrom: fieldRef: @@ -85,6 +115,8 @@ spec: fieldRef: apiVersion: v1 fieldPath: metadata.name + # -- OTLP exporter configuration -- + # HOST_IP points to the node where the OTel Collector / DD Agent is running. - name: HOST_IP valueFrom: fieldRef: @@ -95,6 +127,13 @@ spec: value: 'http://$(HOST_IP):$(OTLP_GRPC_PORT)' - name: OTEL_EXPORTER_OTLP_PROTOCOL value: grpc + # -- OTel resource attributes -- + # These attributes are used by the Datadog exporter to tag metrics and traces. + # service.name: Used as the DD service tag. + # k8s.namespace.name: Used for environment correlation. + # k8s.pod.name / k8s.node.name: Used for infrastructure correlation. + # container.name / container.id: Required for container metrics correlation. + # deployment.environment: Maps to DD's env tag. - name: OTEL_RESOURCE_ATTRIBUTES value: >- service.name=$(OTEL_SERVICE_NAME), @@ -107,9 +146,9 @@ spec: container.name=$(OTEL_CONTAINER_NAME), container.id=$(OTEL_K8S_CONTAINER_ID) resources: - limits: - cpu: 1 - memory: 1Gi requests: - cpu: 1 - memory: 1Gi + cpu: 100m + memory: 128Mi + limits: + cpu: 500m + memory: 256Mi