Skip to content

Commit 1b10efe

Browse files
authored
Skip corrupt blocks on shipper (#6786)
* Skip corrupt blocks on shipper
  Signed-off-by: Daniel Deluiggi <[email protected]>
* changelog
  Signed-off-by: Daniel Deluiggi <[email protected]>
---------
Signed-off-by: Daniel Deluiggi <[email protected]>
1 parent 103b264 commit 1b10efe

File tree

8 files changed

+201
-70
lines changed

8 files changed

+201
-70
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@
5050
* [BUGFIX] Add `__markers__` tenant ID validation. #6761
5151
* [BUGFIX] Ring: Fix nil pointer exception when token is shared. #6768
5252
* [BUGFIX] Fix race condition in active user. #6773
53+
* [BUGFIX] Ingester: Allow shipper to skip corrupted blocks. #6786
5354

5455
## 1.19.0 2025-02-27
5556

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ require (
5252
github.com/stretchr/testify v1.10.0
5353
github.com/thanos-io/objstore v0.0.0-20250317105316-a0136a6f898d
5454
github.com/thanos-io/promql-engine v0.0.0-20250522103302-dd83bd8fdb50
55-
github.com/thanos-io/thanos v0.37.3-0.20250529092349-12649d8be797
55+
github.com/thanos-io/thanos v0.37.3-0.20250603135757-4ad45948cd10
5656
github.com/uber/jaeger-client-go v2.30.0+incompatible
5757
github.com/weaveworks/common v0.0.0-20230728070032-dd9e68f319d5
5858
go.etcd.io/etcd/api/v3 v3.5.17

go.sum

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1697,8 +1697,8 @@ github.com/thanos-io/objstore v0.0.0-20241111205755-d1dd89d41f97 h1:VjG0mwhN1Dkn
16971697
github.com/thanos-io/objstore v0.0.0-20241111205755-d1dd89d41f97/go.mod h1:vyzFrBXgP+fGNG2FopEGWOO/zrIuoy7zt3LpLeezRsw=
16981698
github.com/thanos-io/promql-engine v0.0.0-20250522103302-dd83bd8fdb50 h1:RGdaDAyFOjrFJSjaPT2z8robLvQ3KxNiNEN3DojpLOs=
16991699
github.com/thanos-io/promql-engine v0.0.0-20250522103302-dd83bd8fdb50/go.mod h1:agUazAk1yHLYSL87MdEcRbjN12DJ9OZfSUcfFLqy+F8=
1700-
github.com/thanos-io/thanos v0.37.3-0.20250529092349-12649d8be797 h1:Co+TgEgln2gBoQJ7cjzD9f8Uj3+8MmqPnp33FvNOtYw=
1701-
github.com/thanos-io/thanos v0.37.3-0.20250529092349-12649d8be797/go.mod h1:hqWLLR6Yd7remOR33hRgVkg28Gx40Nh3mhWryB8RVJs=
1700+
github.com/thanos-io/thanos v0.37.3-0.20250603135757-4ad45948cd10 h1:mtmcivEm0EoXeHTJAgjXTthyQSTLNFWrPTzpiovau3Y=
1701+
github.com/thanos-io/thanos v0.37.3-0.20250603135757-4ad45948cd10/go.mod h1:2NvA8ZJtoGcOTriumDnJQzDmbxJz1ISGPovVAGGYDbg=
17021702
github.com/tjhop/slog-gokit v0.1.4 h1:uj/vbDt3HaF0Py8bHPV4ti/s0utnO0miRbO277FLBKM=
17031703
github.com/tjhop/slog-gokit v0.1.4/go.mod h1:Bbu5v2748qpAWH7k6gse/kw3076IJf6owJmh7yArmJs=
17041704
github.com/trivago/tgo v1.0.7 h1:uaWH/XIy9aWYWpjm2CU3RpcqZXmX2ysQ9/Go+d9gyrM=

pkg/ingester/ingester.go

Lines changed: 9 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2566,18 +2566,16 @@ func (i *Ingester) createTSDB(userID string) (*userTSDB, error) {
25662566
// Create a new shipper for this database
25672567
if i.cfg.BlocksStorageConfig.TSDB.IsBlocksShippingEnabled() {
25682568
userDB.shipper = shipper.New(
2569-
userLogger,
2570-
tsdbPromReg,
2571-
udir,
25722569
bucket.NewUserBucketClient(userID, i.TSDBState.bucket, i.limits),
2573-
func() labels.Labels { return l },
2574-
metadata.ReceiveSource,
2575-
func() bool {
2576-
return i.cfg.UploadCompactedBlocksEnabled
2577-
},
2578-
true, // Allow out of order uploads. It's fine in Cortex's context.
2579-
metadata.NoneFunc,
2580-
"",
2570+
udir,
2571+
shipper.WithLogger(userLogger),
2572+
shipper.WithRegisterer(tsdbPromReg),
2573+
shipper.WithLabels(func() labels.Labels { return l }),
2574+
shipper.WithSource(metadata.ReceiveSource),
2575+
shipper.WithHashFunc(metadata.NoneFunc),
2576+
shipper.WithUploadCompacted(i.cfg.UploadCompactedBlocksEnabled),
2577+
shipper.WithAllowOutOfOrderUploads(true), // Allow out of order uploads. It's fine in Cortex's context.
2578+
shipper.WithSkipCorruptedBlocks(true), // Skip corrupted blocks, in the same spirit as allowing out-of-order uploads. Errors should be tracked via metrics.
25812579
)
25822580
userDB.shipperMetadataFilePath = filepath.Join(userDB.db.Dir(), filepath.Clean(shipper.DefaultMetaFilename))
25832581

pkg/ingester/metrics.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -312,6 +312,7 @@ type tsdbMetrics struct {
312312
dirSyncFailures *prometheus.Desc // sum(thanos_shipper_dir_sync_failures_total)
313313
uploads *prometheus.Desc // sum(thanos_shipper_uploads_total)
314314
uploadFailures *prometheus.Desc // sum(thanos_shipper_upload_failures_total)
315+
corruptedBlocks *prometheus.Desc // sum(thanos_shipper_corrupted_blocks_total)
315316

316317
// Metrics aggregated from TSDB.
317318
tsdbCompactionsTotal *prometheus.Desc
@@ -390,6 +391,10 @@ func newTSDBMetrics(r prometheus.Registerer) *tsdbMetrics {
390391
"cortex_ingester_shipper_upload_failures_total",
391392
"Total number of TSDB block upload failures",
392393
nil, nil),
394+
corruptedBlocks: prometheus.NewDesc(
395+
"cortex_ingester_shipper_corrupted_blocks_total",
396+
"Total number of TSDB blocks corrupted",
397+
nil, nil),
393398
tsdbCompactionsTotal: prometheus.NewDesc(
394399
"cortex_ingester_tsdb_compactions_total",
395400
"Total number of TSDB compactions that were executed.",
@@ -579,6 +584,7 @@ func (sm *tsdbMetrics) Describe(out chan<- *prometheus.Desc) {
579584
out <- sm.dirSyncFailures
580585
out <- sm.uploads
581586
out <- sm.uploadFailures
587+
out <- sm.corruptedBlocks
582588

583589
out <- sm.tsdbCompactionsTotal
584590
out <- sm.tsdbCompactionDuration
@@ -636,6 +642,7 @@ func (sm *tsdbMetrics) Collect(out chan<- prometheus.Metric) {
636642
data.SendSumOfCounters(out, sm.dirSyncFailures, "thanos_shipper_dir_sync_failures_total")
637643
data.SendSumOfCounters(out, sm.uploads, "thanos_shipper_uploads_total")
638644
data.SendSumOfCounters(out, sm.uploadFailures, "thanos_shipper_upload_failures_total")
645+
data.SendSumOfCounters(out, sm.corruptedBlocks, "thanos_shipper_corrupted_blocks_total")
639646

640647
data.SendSumOfCounters(out, sm.tsdbCompactionsTotal, "prometheus_tsdb_compactions_total")
641648
data.SendSumOfHistograms(out, sm.tsdbCompactionDuration, "prometheus_tsdb_compaction_duration_seconds")

pkg/ingester/metrics_test.go

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,11 @@ func TestTSDBMetrics(t *testing.T) {
187187
# 4*(12345 + 85787 + 999)
188188
cortex_ingester_shipper_upload_failures_total 396524
189189
190+
# HELP cortex_ingester_shipper_corrupted_blocks_total Total number of TSDB blocks corrupted
191+
# TYPE cortex_ingester_shipper_corrupted_blocks_total counter
192+
# 30*(12345 + 85787 + 999)
193+
cortex_ingester_shipper_corrupted_blocks_total 2973930
194+
190195
# HELP cortex_ingester_tsdb_compactions_total Total number of TSDB compactions that were executed.
191196
# TYPE cortex_ingester_tsdb_compactions_total counter
192197
cortex_ingester_tsdb_compactions_total 693917
@@ -446,6 +451,12 @@ func TestTSDBMetricsWithRemoval(t *testing.T) {
446451
# 4*(12345 + 85787 + 999)
447452
cortex_ingester_shipper_upload_failures_total 396524
448453
454+
# HELP cortex_ingester_shipper_corrupted_blocks_total Total number of TSDB blocks corrupted
455+
# TYPE cortex_ingester_shipper_corrupted_blocks_total counter
456+
# 30*(12345 + 85787 + 999)
457+
cortex_ingester_shipper_corrupted_blocks_total 2973930
458+
459+
449460
# HELP cortex_ingester_tsdb_compactions_total Total number of TSDB compactions that were executed.
450461
# TYPE cortex_ingester_tsdb_compactions_total counter
451462
cortex_ingester_tsdb_compactions_total 693917
@@ -688,6 +699,12 @@ func populateTSDBMetrics(base float64) *prometheus.Registry {
688699
})
689700
uploadFailures.Add(4 * base)
690701

702+
corruptedBlocks := promauto.With(r).NewCounter(prometheus.CounterOpts{
703+
Name: "thanos_shipper_corrupted_blocks_total",
704+
Help: "Total number of corrupted blocks",
705+
})
706+
corruptedBlocks.Add(30 * base)
707+
691708
// TSDB Head
692709
seriesCreated := promauto.With(r).NewCounter(prometheus.CounterOpts{
693710
Name: "prometheus_tsdb_head_series_created_total",

0 commit comments

Comments
 (0)