Add a health check endpoint (#36)

chris-brindley · web-flow · commit fe86d2069bdf · 2022-01-05T11:46:09.000Z
* Update profiler and configuration to serve on a non-default mux
* Add a healthcheck handler, configuration, and HTTP endpoint
* Update example config and README with healthcheck configuration
diff --git a/README.md b/README.md
@@ -111,6 +111,8 @@ line options.
 | DATASET_FILTER | --dataset-filter | BigQuery label to filter datasets for metric collection |
 | GCP_PROJECT_ID | --gcp-project-id | (Required) The Google Cloud project containing the BigQuery tables to retrieve metrics from |
 | GOOGLE_APPLICATION_CREDENTIALS | | File containing service account details to authenticate to Google Cloud using |
+| HEALTHCHECK_ENABLED | --healthcheck.enabled | Whether to enable the health check endpoint at /health. Defaults to *false* |
+| HEALTHCHECK_PORT | --healthcheck.port | The port to run the health check server on. Defaults to *8080* | 
 | LOG_LEVEL | | The logging level (e.g. trace, debug, info, warn, error). Defaults to *info* |
 | METRIC_INTERVAL | --metric-interval | The interval between metric collection rounds. Must contain a unit and valid units are "ns", "us" (or "µs"), "ms", "s", "m", "h". Defaults to *30s* |
 | METRIC_PREFIX | --metric-prefix | The prefix for the metric names exported to Datadog. Defaults to *custom.gcp.bigquery* |
diff --git a/cmd/bqmetricsd/main.go b/cmd/bqmetricsd/main.go
@@ -5,10 +5,11 @@ import (
 	"fmt"
 	"github.com/ovotech/bigquery-metrics-extractor/pkg/config"
 	"github.com/ovotech/bigquery-metrics-extractor/pkg/daemon"
+	"github.com/ovotech/bigquery-metrics-extractor/pkg/health"
 	"github.com/rs/zerolog"
 	"github.com/rs/zerolog/log"
 	"net/http"
-	_ "net/http/pprof"
+	"net/http/pprof"
 	"os"
 	"os/signal"
 )
@@ -22,12 +23,33 @@ func main() {
 		log.Fatal().Err(err).Msg("Failed to parse config")
 	}
 
-	if cfg.Profiling {
-		addr := "localhost:6060"
+	if cfg.Profiler.Enabled {
+		addr := fmt.Sprintf("localhost:%d", cfg.Profiler.Port)
 		log.Info().Msgf("Running profiler on %s", addr)
 
+		mux := http.NewServeMux()
+		mux.HandleFunc("/debug/pprof/", pprof.Index)
+		mux.HandleFunc("/debug/pprof/cmdline", pprof.Cmdline)
+		mux.HandleFunc("/debug/pprof/profile", pprof.Profile)
+		mux.HandleFunc("/debug/pprof/symbol", pprof.Symbol)
+		mux.HandleFunc("/debug/pprof/trace", pprof.Trace)
+
+		go func() {
+			log.Err(http.ListenAndServe(addr, mux)).Msg("Shutting down profiler")
+		}()
+	}
+
+	if cfg.HealthCheck.Enabled {
+		addr := fmt.Sprintf("localhost:%d", cfg.HealthCheck.Port)
+		log.Info().Msgf("Running healthcheck server on %s", addr)
+
+		healthsrv := health.ServiceStatus{Status: health.Ok}
+
+		mux := http.NewServeMux()
+		mux.HandleFunc("/health", healthsrv.Handler)
+
 		go func() {
-			log.Err(http.ListenAndServe(addr, nil)).Msg("Shutting down profiler")
+			log.Err(http.ListenAndServe(addr, mux)).Msg("Shutting down healthcheck server")
 		}()
 	}
 
diff --git a/example-config.yaml b/example-config.yaml
@@ -57,3 +57,11 @@
 #       SELECT APPROX_COUNT_DISTINCT(`my-column-1`) AS `my-column-1`,
 #              APPROX_COUNT_DISTINCT(`my-column-2`) AS `my-column-2`
 #       FROM `my-project.my-dataset.my-table`
+
+###
+# Configuration for the healthcheck endpoint, used to determine whether the
+# service is healthy or not
+#
+# healthcheck:
+#   enabled: true
+#   port: 8080
diff --git a/pkg/config/config.go b/pkg/config/config.go
@@ -44,7 +44,8 @@ type Config struct {
 	MetricTags     []string       `viper:"metric-tags"`
 	MetricInterval time.Duration  `viper:"metric-interval"`
 	CustomMetrics  []CustomMetric `viper:"custom-metrics"`
-	Profiling      bool           `viper:"enable-profiler"`
+	Profiler       Profiler       `viper:"profiler"`
+	HealthCheck    HealthCheck    `viper:"healthcheck"`
 }
 
 // CustomMetric holds details about a metric generated from an SQL query
@@ -55,6 +56,18 @@ type CustomMetric struct {
 	SQL            string        `viper:"sql"`
 }
 
+// Profiler holds the profiler server settings: whether it runs and on which port
+type Profiler struct {
+	Enabled bool `viper:"enabled"` // the pprof server is started only when true
+	Port    int  `viper:"port"`    // ValidateConfig requires 1-65535 when Enabled
+}
+
+// HealthCheck holds the health endpoint settings: whether it runs and on which port
+type HealthCheck struct {
+	Enabled bool `viper:"enabled"` // the /health server is started only when true
+	Port    int  `viper:"port"`    // ValidateConfig requires 1-65535 when Enabled
+}
+
 // NewConfig creates a config struct using the package viper for configuration
 // construction. Configuration can either be passed in a config file, as flags
 // when running the application, or as environment variables. Priority is as
@@ -150,6 +163,18 @@ func ValidateConfig(c *Config) error {
 		}
 	}
 
+	if c.HealthCheck.Enabled {
+		if c.HealthCheck.Port <= 0 || c.HealthCheck.Port > 65535 {
+			return ErrInvalidPort
+		}
+	}
+
+	if c.Profiler.Enabled {
+		if c.Profiler.Port <= 0 || c.Profiler.Port > 65535 {
+			return ErrInvalidPort
+		}
+	}
+
 	return nil
 }
 
@@ -176,7 +201,10 @@ func configFlags(name string) *pflag.FlagSet {
 	flags.String("metric-prefix", DefaultMetricPrefix, fmt.Sprintf("The prefix for the metrics names exported to Datadog (Default %s)", DefaultMetricPrefix))
 	flags.Duration("metric-interval", defInterval, fmt.Sprintf("The interval between metrics submissions (Default %s)", DefaultMetricInterval))
 	flags.StringSlice("metric-tags", []string{}, "Comma-delimited list of tags to attach to metrics")
-	flags.Bool("enable-profiler", false, "Enables the profiler")
+	flags.Bool("profiler.enabled", false, "Enables the profiler")
+	flags.Int("profiler.port", 6060, "The port on which to run the profiler server")
+	flags.Bool("healthcheck.enabled", false, "Enables the health check endpoint")
+	flags.Int("healthcheck.port", 8080, "The port on which to run the server providing the health check endpoint")
 
 	_ = flags.Parse(os.Args[1:])
 
@@ -255,6 +283,7 @@ func handleEnvBindings(vpr *viper.Viper, fs *pflag.FlagSet) {
 
 	fs.VisitAll(func(f *pflag.Flag) {
 		env := strings.ReplaceAll(f.Name, "-", "_")
+		env = strings.ReplaceAll(env, ".", "_")
 		_ = vpr.BindEnv(f.Name, strings.ToUpper(env))
 	})
 }
diff --git a/pkg/config/config_test.go b/pkg/config/config_test.go
@@ -41,47 +41,52 @@ func TestNewConfig(t *testing.T) {
 		want    *Config
 		wantErr bool
 	}{
-		{"all via env", setup([]string{"DATADOG_API_KEY=abc123", "DATASET_FILTER=bqmetrics:enabled", "GCP_PROJECT_ID=my-project-id", "METRIC_PREFIX=custom.gcp.bigquery.stats", "METRIC_TAGS=env:prod", "METRIC_INTERVAL=2m"}, nil, ""), args{"bqmetricstest"}, &Config{
+		{"all via env", setup([]string{"DATADOG_API_KEY=abc123", "DATASET_FILTER=bqmetrics:enabled", "GCP_PROJECT_ID=my-project-id", "METRIC_PREFIX=custom.gcp.bigquery.stats", "METRIC_TAGS=env:prod", "METRIC_INTERVAL=2m", "HEALTHCHECK_ENABLED=true"}, nil, ""), args{"bqmetricstest"}, &Config{
 			DatadogAPIKey:  "abc123",
 			DatasetFilter:  "bqmetrics:enabled",
 			GcpProject:     "my-project-id",
 			MetricPrefix:   "custom.gcp.bigquery.stats",
 			MetricTags:     []string{"env:prod"},
 			MetricInterval: 2 * time.Minute,
-			Profiling:      false,
+			Profiler:       Profiler{false, 6060},
+			HealthCheck:    HealthCheck{true, 8080},
 		}, false},
-		{"all via cmd", setup(nil, []string{"--datadog-api-key-file=/tmp/dd.key", "--dataset-filter=bqmetrics:enabled", "--gcp-project-id=my-project-id", "--metric-prefix=custom.gcp.bigquery.stats", "--metric-tags=env:prod", "--metric-interval=2m", "--enable-profiler"}, "abc123"), args{"bqmetricstest"}, &Config{
+		{"all via cmd", setup(nil, []string{"--datadog-api-key-file=/tmp/dd.key", "--dataset-filter=bqmetrics:enabled", "--gcp-project-id=my-project-id", "--metric-prefix=custom.gcp.bigquery.stats", "--metric-tags=env:prod", "--metric-interval=2m", "--profiler.enabled"}, "abc123"), args{"bqmetricstest"}, &Config{
 			DatadogAPIKey:  "abc123",
 			DatasetFilter:  "bqmetrics:enabled",
 			GcpProject:     "my-project-id",
 			MetricPrefix:   "custom.gcp.bigquery.stats",
 			MetricTags:     []string{"env:prod"},
 			MetricInterval: 2 * time.Minute,
-			Profiling:      true,
+			Profiler:       Profiler{true, 6060},
+			HealthCheck:    HealthCheck{false, 8080},
 		}, false},
 		{"mixture of sources", setup([]string{"DATADOG_API_KEY=abc123", "GCP_PROJECT_ID=my-project-id"}, []string{"--metric-prefix=custom.gcp.bigquery.stats", "--metric-tags=env:prod", "--metric-interval=2m"}, ""), args{"bqmetricstest"}, &Config{
 			DatadogAPIKey:  "abc123",
 			GcpProject:     "my-project-id",
 			MetricPrefix:   "custom.gcp.bigquery.stats",
 			MetricTags:     []string{"env:prod"},
 			MetricInterval: 2 * time.Minute,
-			Profiling:      false,
+			Profiler:       Profiler{false, 6060},
+			HealthCheck:    HealthCheck{false, 8080},
 		}, false},
 		{"minimum required config", setup([]string{"DATADOG_API_KEY=abc123", "GCP_PROJECT_ID=my-project-id"}, nil, ""), args{"bqmetricstest"}, &Config{
 			DatadogAPIKey:  "abc123",
 			GcpProject:     "my-project-id",
 			MetricPrefix:   DefaultMetricPrefix,
 			MetricTags:     nil,
 			MetricInterval: 30 * time.Second,
-			Profiling:      false,
+			Profiler:       Profiler{false, 6060},
+			HealthCheck:    HealthCheck{false, 8080},
 		}, false},
 		{"default credentials", setup([]string{"DATADOG_API_KEY=abc123", "GOOGLE_APPLICATION_CREDENTIALS=/tmp/dd.key"}, nil, "{\"type\": \"service_account\", \"project_id\": \"my-project-id\"}"), args{"bqmetricstest"}, &Config{
 			DatadogAPIKey:  "abc123",
 			GcpProject:     "my-project-id",
 			MetricPrefix:   DefaultMetricPrefix,
 			MetricTags:     nil,
 			MetricInterval: 30 * time.Second,
-			Profiling:      false,
+			Profiler:       Profiler{false, 6060},
+			HealthCheck:    HealthCheck{false, 8080},
 		}, false},
 		{"unreadable key file", setup([]string{"DATADOG_API_KEY_FILE=/tmp/not-found.key", "GCP_PROJECT_ID=my-project-id"}, nil, "abc123"), args{"bqmetricstest"}, nil, true},
 		{"missing key", setup([]string{"GCP_PROJECT_ID=my-project-id"}, nil, ""), args{"bqmetricstest"}, nil, true},
@@ -113,7 +118,7 @@ func TestNewConfig_configFile(t *testing.T) {
 		_ = os.Remove(n)
 	}()
 
-	data := []byte("{\"datadog-api-key\": \"abc123\", \"dataset-filter\": \"bqmetrics:enabled\", \"gcp-project-id\": \"my-project-id\", \"metric-prefix\": \"custom.gcp.bigquery.stats\", \"metric-tags\": \"env:prod,team:my-team\", \"metric-interval\": \"2m\"}")
+	data := []byte("{\"datadog-api-key\": \"abc123\", \"dataset-filter\": \"bqmetrics:enabled\", \"gcp-project-id\": \"my-project-id\", \"metric-prefix\": \"custom.gcp.bigquery.stats\", \"metric-tags\": \"env:prod,team:my-team\", \"metric-interval\": \"2m\", \"healthcheck\": {\"enabled\": true, \"port\": 8081}}")
 	if _, err = f.Write(data); err != nil {
 		t.Fatalf("error when writing test config file: %s", err)
 	}
@@ -126,7 +131,8 @@ func TestNewConfig_configFile(t *testing.T) {
 		MetricPrefix:   "custom.gcp.bigquery.stats",
 		MetricTags:     []string{"env:prod", "team:my-team"},
 		MetricInterval: 2 * time.Minute,
-		Profiling:      false,
+		Profiler:       Profiler{false, 6060},
+		HealthCheck:    HealthCheck{true, 8081},
 	}
 
 	got, err := NewConfig("bqmetricstest")
@@ -186,13 +192,14 @@ func TestNewConfig_configFileWithCustomQueries(t *testing.T) {
 		MetricPrefix:   "custom.gcp.bigquery.stats",
 		MetricTags:     []string{"env:prod", "team:my-team"},
 		MetricInterval: 2 * time.Minute,
-		Profiling:      false,
+		Profiler:       Profiler{false, 6060},
 		CustomMetrics: []CustomMetric{{
 			MetricName:     "my_metric",
 			MetricTags:     []string{"table_id:table"},
 			MetricInterval: 2 * time.Minute,
 			SQL:            "SELECT COUNT(DISTINCT *) FROM `table`",
 		}},
+		HealthCheck: HealthCheck{false, 8080},
 	}
 
 	got, err := NewConfig("bqmetricstest")
@@ -262,7 +269,7 @@ func TestValidateConfig(t *testing.T) {
 			MetricPrefix:   "custom.gcp.bigquery.stats",
 			MetricTags:     []string{"env:prod"},
 			MetricInterval: time.Duration(30000),
-			Profiling:      false,
+			Profiler:       Profiler{false, 6060},
 		}}, false},
 		{"custom metrics okay", args{&Config{
 			DatadogAPIKey:  "abc123",
@@ -276,6 +283,7 @@ func TestValidateConfig(t *testing.T) {
 				MetricInterval: time.Duration(36000000),
 				SQL:            "SELECT COUNT(DISTINCT `my-column`) FROM `my-dataset.my-table`",
 			}},
+			HealthCheck: HealthCheck{false, 8080},
 		}}, false},
 		{"custom metrics missing name", args{&Config{
 			DatadogAPIKey:  "abc123",
@@ -299,6 +307,22 @@ func TestValidateConfig(t *testing.T) {
 				MetricInterval: time.Duration(36000000),
 			}},
 		}}, true},
+		{"health check disabled", args{&Config{
+			DatadogAPIKey:  "abc123",
+			GcpProject:     "my-project-id",
+			MetricPrefix:   "custom.gcp.bigquery.stats",
+			MetricTags:     []string{"env:prod"},
+			MetricInterval: time.Duration(30000),
+			HealthCheck:    HealthCheck{false, 8080},
+		}}, false},
+		{"health check port invalid", args{&Config{
+			DatadogAPIKey:  "abc123",
+			GcpProject:     "my-project-id",
+			MetricPrefix:   "custom.gcp.bigquery.stats",
+			MetricTags:     []string{"env:prod"},
+			MetricInterval: time.Duration(30000),
+			HealthCheck:    HealthCheck{true, -8080},
+		}}, true},
 	}
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
diff --git a/pkg/config/errors.go b/pkg/config/errors.go
@@ -20,4 +20,7 @@ var (
 
 	// ErrMissingCustomMetricSQL is the error returned when a CustomMetric is missing SQL
 	ErrMissingCustomMetricSQL = errors.New("no custom metric sql query configured")
+
+	// ErrInvalidPort is the error returned when an invalid port is specified
+	ErrInvalidPort = errors.New("invalid port specified")
 )
diff --git a/pkg/health/http.go b/pkg/health/http.go
@@ -0,0 +1,44 @@
+package health
+
+import (
+	"encoding/json"
+	"github.com/rs/zerolog/log"
+	"net/http"
+)
+
+// Status is the health status of the service, emitted as a string in the health response
+type Status string
+
+const (
+	// Ok status means that the service is operating nominally; Handler answers HTTP 200
+	Ok    = Status("OK")
+
+	// Error status means that something is going wrong with the service; Handler answers HTTP 500
+	Error = Status("Error")
+)
+
+// ServiceStatus holds the health of the bqmetricsd service and is marshalled as the JSON response body
+type ServiceStatus struct {
+	Status Status `json:"status"` // serialised under the "status" key
+}
+
+// Handler writes hs as JSON: HTTP 200 when Status is Ok, HTTP 500 for any other status or a marshal failure
+func (hs ServiceStatus) Handler(w http.ResponseWriter, _ *http.Request) {
+	data, err := json.Marshal(hs)
+	if err != nil {
+		http.Error(w, err.Error(), http.StatusInternalServerError)
+		return
+	}
+
+	w.Header().Set("Content-Type", "application/json") // headers must be set before WriteHeader is called
+	switch hs.Status {
+	case Ok:
+		w.WriteHeader(http.StatusOK)
+	default: // every non-Ok status, including Error, is reported as a server error
+		w.WriteHeader(http.StatusInternalServerError)
+	}
+
+	if _, err = w.Write(data); err != nil { // status line is already sent, so the failure can only be logged
+		log.Err(err).Msg("error when writing health http response")
+	}
+}
diff --git a/pkg/health/http_test.go b/pkg/health/http_test.go
@@ -0,0 +1,52 @@
+package health
+
+import (
+	"net/http"
+	"net/http/httptest"
+	"testing"
+)
+
+func TestServiceStatus_Handler(t *testing.T) {
+	type fields struct {
+		Status Status
+	}
+	type want struct {
+		status int
+		body   string
+	}
+	tests := []struct {
+		name   string
+		fields fields
+		want   want
+	}{
+		{"health ok", fields{Status: Ok}, want{200, "{\"status\":\"OK\"}"}}, // Ok maps to HTTP 200
+		{"health fail", fields{Status: Error}, want{500, "{\"status\":\"Error\"}"}}, // Error maps to HTTP 500
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			hs := ServiceStatus{
+				Status: tt.fields.Status,
+			}
+
+			req, err := http.NewRequest("GET", "/health", nil)
+			if err != nil {
+				t.Fatal(err)
+			}
+
+			rr := httptest.NewRecorder() // records the status code and body written by the handler
+			handler := http.HandlerFunc(hs.Handler)
+
+			handler.ServeHTTP(rr, req) // invoke the handler directly, without starting a server
+
+			if rr.Code != tt.want.status {
+				t.Errorf("handler returned wrong status code: got %v want %v",
+					rr.Code, tt.want.status)
+			}
+
+			if rr.Body.String() != tt.want.body {
+				t.Errorf("handler returned unexpected body: got %v want %v",
+					rr.Body.String(), tt.want.body)
+			}
+		})
+	}
+}

Original file line number	Diff line number	Diff line change
`@@ -20,4 +20,7 @@ var (`
`20`	`20`
`21`	`21`	`// ErrMissingCustomMetricSQL is the error returned when a CustomMetric is missing SQL`
`22`	`22`	`ErrMissingCustomMetricSQL = errors.New("no custom metric sql query configured")`
	`23`	`+`
	`24`	`+ // ErrInvalidPort is the error returned when an invalid port is specified`
	`25`	`+ ErrInvalidPort = errors.New("invalid port specified")`
`23`	`26`	`)`