Skip to content

Commit 53ccb25

Browse files
craig[bot]spilchen
andcommitted
Merge #156384
156384: sql/inspect: record spans processed metric r=spilchen a=spilchen This add a new INSPECT metric to track a counter of the number spans completed. This will be useful to see the rate of progress for INSPECT. Closes #155484 Epic: CRDB-55075 Release note: none Co-authored-by: Matt Spilchen <[email protected]>
2 parents a6eba0e + 8f7c8be commit 53ccb25

File tree

6 files changed

+55
-19
lines changed

6 files changed

+55
-19
lines changed

docs/generated/metrics/metrics.yaml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5137,6 +5137,14 @@ layers:
51375137
unit: COUNT
51385138
aggregation: AVG
51395139
derivative: NON_NEGATIVE_DERIVATIVE
5140+
- name: jobs.inspect.spans_processed
5141+
exported_name: jobs_inspect_spans_processed
5142+
description: Number of spans processed by INSPECT jobs
5143+
y_axis_label: Spans
5144+
type: COUNTER
5145+
unit: COUNT
5146+
aggregation: AVG
5147+
derivative: NON_NEGATIVE_DERIVATIVE
51405148
- name: jobs.key_visualizer.currently_idle
51415149
exported_name: jobs_key_visualizer_currently_idle
51425150
labeled_name: 'jobs{type: key_visualizer, status: currently_idle}'

pkg/roachprod/agents/opentelemetry/cockroachdb_metrics.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -902,6 +902,7 @@ var cockroachdbMetrics = map[string]string{
902902
"jobs_inspect_resume_retry_error": "jobs.inspect.resume_retry_error",
903903
"jobs_inspect_runs": "jobs.inspect.runs",
904904
"jobs_inspect_runs_with_issues": "jobs.inspect.runs_with_issues",
905+
"jobs_inspect_spans_processed": "jobs.inspect.spans_processed",
905906
"jobs_key_visualizer_currently_idle": "jobs.key_visualizer.currently_idle",
906907
"jobs_key_visualizer_currently_paused": "jobs.key_visualizer.currently_paused",
907908
"jobs_key_visualizer_currently_running": "jobs.key_visualizer.currently_running",

pkg/sql/inspect/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,7 @@ go_test(
8585
"//build/toolchains:is_heavy": {"test.Pool": "large"},
8686
"//conditions:default": {"test.Pool": "default"},
8787
}),
88+
shard_count = 4,
8889
deps = [
8990
"//pkg/base",
9091
"//pkg/jobs",

pkg/sql/inspect/inspect_metrics.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ type InspectMetrics struct {
1717
Runs *metric.Counter
1818
RunsWithIssues *metric.Counter
1919
IssuesFound *metric.Counter
20+
SpansProcessed *metric.Counter
2021
}
2122

2223
var _ metric.Struct = (*InspectMetrics)(nil)
@@ -43,6 +44,12 @@ var (
4344
Measurement: "Issues",
4445
Unit: metric.Unit_COUNT,
4546
}
47+
metaInspectSpansProcessed = metric.Metadata{
48+
Name: "jobs.inspect.spans_processed",
49+
Help: "Number of spans processed by INSPECT jobs",
50+
Measurement: "Spans",
51+
Unit: metric.Unit_COUNT,
52+
}
4653
)
4754

4855
// MakeInspectMetrics instantiates the metrics for INSPECT jobs.
@@ -51,6 +58,7 @@ func MakeInspectMetrics(histogramWindow time.Duration) metric.Struct {
5158
Runs: metric.NewCounter(metaInspectRuns),
5259
RunsWithIssues: metric.NewCounter(metaInspectRunsWithIssues),
5360
IssuesFound: metric.NewCounter(metaInspectIssuesFound),
61+
SpansProcessed: metric.NewCounter(metaInspectSpansProcessed),
5462
}
5563
}
5664

pkg/sql/inspect/inspect_metrics_test.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,12 +51,15 @@ func TestInspectMetrics(t *testing.T) {
5151
initialRuns := metrics.Runs.Count()
5252
initialRunsWithIssues := metrics.RunsWithIssues.Count()
5353
initialIssuesFound := metrics.IssuesFound.Count()
54+
initialSpansProcessed := metrics.SpansProcessed.Count()
5455

5556
// First run: no corruption, should succeed without issues
5657
runner.Exec(t, "INSPECT TABLE db.t")
5758
require.Equal(t, initialRuns+1, metrics.Runs.Count(), "Runs counter should increment")
5859
require.Equal(t, initialRunsWithIssues, metrics.RunsWithIssues.Count(), "RunsWithIssues should not increment")
5960
require.Equal(t, initialIssuesFound, metrics.IssuesFound.Count(), "IssuesFound should not increment")
61+
require.Equal(t, initialSpansProcessed+1, metrics.SpansProcessed.Count(),
62+
"SpansProcessed should increment by 1 (one secondary index i1)")
6063

6164
// Create corruption: delete a secondary index entry for row (1, 2)
6265
// This creates a "missing_secondary_index_entry" issue - the primary key exists
@@ -86,6 +89,8 @@ func TestInspectMetrics(t *testing.T) {
8689
"RunsWithIssues should increment when issues are found")
8790
require.Equal(t, initialIssuesFound+1, metrics.IssuesFound.Count(),
8891
"IssuesFound should increment when issues are detected")
92+
require.Equal(t, initialSpansProcessed+2, metrics.SpansProcessed.Count(),
93+
"SpansProcessed should increment by 2 total (1 span per INSPECT × 2 runs)")
8994

9095
// Third run: on a different clean table to verify RunsWithIssues doesn't increment again.
9196
runner.Exec(t, `
@@ -103,4 +108,6 @@ func TestInspectMetrics(t *testing.T) {
103108
"RunsWithIssues should NOT increment for successful job")
104109
require.Equal(t, initialIssuesFound+1, metrics.IssuesFound.Count(),
105110
"IssuesFound should NOT increment for successful job")
111+
require.Equal(t, initialSpansProcessed+3, metrics.SpansProcessed.Count(),
112+
"SpansProcessed should increment by 3 total (1 span per INSPECT × 3 runs)")
106113
}

pkg/sql/inspect/inspect_processor.go

Lines changed: 30 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import (
2424
"github.com/cockroachdb/cockroach/pkg/util/ctxgroup"
2525
"github.com/cockroachdb/cockroach/pkg/util/hlc"
2626
"github.com/cockroachdb/cockroach/pkg/util/log"
27+
"github.com/cockroachdb/cockroach/pkg/util/metric"
2728
"github.com/cockroachdb/cockroach/pkg/util/syncutil"
2829
"github.com/cockroachdb/cockroach/pkg/util/tracing"
2930
"github.com/cockroachdb/errors"
@@ -83,16 +84,17 @@ func getInspectQoS(sv *settings.Values) sessiondatapb.QoSLevel {
8384
type inspectCheckFactory func(asOf hlc.Timestamp) inspectCheck
8485

8586
type inspectProcessor struct {
86-
processorID int32
87-
flowCtx *execinfra.FlowCtx
88-
spec execinfrapb.InspectSpec
89-
cfg *execinfra.ServerConfig
90-
checkFactories []inspectCheckFactory
91-
spanSrc spanSource
92-
loggerBundle *inspectLoggerBundle
93-
concurrency int
94-
clock *hlc.Clock
95-
mu struct {
87+
processorID int32
88+
flowCtx *execinfra.FlowCtx
89+
spec execinfrapb.InspectSpec
90+
cfg *execinfra.ServerConfig
91+
checkFactories []inspectCheckFactory
92+
spanSrc spanSource
93+
loggerBundle *inspectLoggerBundle
94+
concurrency int
95+
clock *hlc.Clock
96+
spansProcessedCtr *metric.Counter
97+
mu struct {
9698
// Guards calls to output.Push because DistSQLReceiver.Push is not
9799
// concurrency safe and progress metadata can be emitted from multiple
98100
// worker goroutines.
@@ -365,6 +367,9 @@ func (p *inspectProcessor) getTimestampForSpan() hlc.Timestamp {
365367
func (p *inspectProcessor) sendSpanCompletionProgress(
366368
ctx context.Context, output execinfra.RowReceiver, completedSpan roachpb.Span, finished bool,
367369
) error {
370+
if p.spansProcessedCtr != nil {
371+
p.spansProcessedCtr.Inc(1)
372+
}
368373
progressMsg := &jobspb.InspectProcessorProgress{
369374
ChecksCompleted: 0, // No additional checks completed, just marking span done
370375
Finished: finished,
@@ -414,17 +419,23 @@ func newInspectProcessor(
414419
if err != nil {
415420
return nil, err
416421
}
422+
var spansProcessedCtr *metric.Counter
423+
if execCfg, ok := flowCtx.Cfg.ExecutorConfig.(*sql.ExecutorConfig); ok {
424+
inspectMetrics := execCfg.JobRegistry.MetricsStruct().Inspect.(*InspectMetrics)
425+
spansProcessedCtr = inspectMetrics.SpansProcessed
426+
}
417427

418428
return &inspectProcessor{
419-
spec: spec,
420-
processorID: processorID,
421-
flowCtx: flowCtx,
422-
checkFactories: checkFactories,
423-
cfg: flowCtx.Cfg,
424-
spanSrc: newSliceSpanSource(spec.Spans),
425-
loggerBundle: getInspectLogger(flowCtx, spec.JobID),
426-
concurrency: getProcessorConcurrency(flowCtx),
427-
clock: flowCtx.Cfg.DB.KV().Clock(),
429+
spec: spec,
430+
processorID: processorID,
431+
flowCtx: flowCtx,
432+
checkFactories: checkFactories,
433+
cfg: flowCtx.Cfg,
434+
spanSrc: newSliceSpanSource(spec.Spans),
435+
loggerBundle: getInspectLogger(flowCtx, spec.JobID),
436+
concurrency: getProcessorConcurrency(flowCtx),
437+
clock: flowCtx.Cfg.DB.KV().Clock(),
438+
spansProcessedCtr: spansProcessedCtr,
428439
}, nil
429440
}
430441

0 commit comments

Comments
 (0)