
Commit b387afd

nicolastakashi, jaronoff97, and swiatekm authored
[CHORE] processing discovered targets async (#3517)
* Revert "Support configuring java runtime from configmap or secret (env.valueFrom)" (#3510)
* Revert "Support configuring java runtime from configmap or secret (env.valueF…"
  This reverts commit 2b36f0d.
* chlog (#3511)
* [CHORE] changing log level
  Signed-off-by: Nicolas Takashi <[email protected]>
* [CHORE] renaming method
  Signed-off-by: Nicolas Takashi <[email protected]>
* [CHORE] adding change log entry
  Signed-off-by: Nicolas Takashi <[email protected]>
* [CHORE] locking targets per job
  Signed-off-by: Nicolas Takashi <[email protected]>
* Update .chloggen/discovering-target-async.yaml
  Co-authored-by: Mikołaj Świątek <[email protected]>
* [REFACTORY] applying comments
  Signed-off-by: Nicolas Takashi <[email protected]>
* [CHORE] adding mutex back
  Signed-off-by: Nicolas Takashi <[email protected]>

---------

Signed-off-by: Nicolas Takashi <[email protected]>
Co-authored-by: Jacob Aronoff <[email protected]>
Co-authored-by: Mikołaj Świątek <[email protected]>
1 parent 5eefae8 commit b387afd

File tree

5 files changed: +187 -67 lines changed
.chloggen/discovering-target-async.yaml

+21

@@ -0,0 +1,21 @@
+# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
+change_type: enhancement
+
+# The name of the component, or a single word describing the area of concern, (e.g. collector, target allocator, auto-instrumentation, opamp, github action)
+component: target allocator
+
+# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`).
+note: Process discovered targets asynchronously
+
+# One or more tracking issues related to the change
+issues: [1842]
+
+# (Optional) One or more lines of additional information to render under the primary note.
+# These lines will be padded with 2 spaces and then inserted directly into the document.
+# Use pipe (|) for multiline entries.
+subtext: |
+  This change enables the target allocator to process discovered targets asynchronously.
+  This is a significant performance improvement for the target allocator, as it allows it to process targets in parallel, rather than sequentially.
+  This change also introduces new metrics to track the performance of the target allocator.
+  - opentelemetry_allocator_process_targets_duration_seconds: The duration of the process targets operation.
+  - opentelemetry_allocator_process_target_groups_duration_seconds: The duration of the process target groups operation.
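As context for the subtext above, this is the general shape of the change: per-job work fans out to goroutines, results are merged under a mutex, and the whole pass is timed with a Prometheus histogram. The sketch below is illustrative only; processJob, the metric name, and the data shapes are placeholders rather than the operator's actual types.

package main

import (
	"sync"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
)

// Hypothetical histogram mirroring the kind of duration metric added in this change.
var processDuration = promauto.NewHistogram(prometheus.HistogramOpts{
	Name:    "example_process_targets_duration_seconds",
	Help:    "Duration of processing all discovered targets.",
	Buckets: []float64{1, 5, 10, 30, 60, 120},
})

// processJob stands in for the per-job work (relabeling, hashing, etc.).
func processJob(job string, groups []string) map[string]string {
	out := make(map[string]string, len(groups))
	for _, g := range groups {
		out[job+"/"+g] = g
	}
	return out
}

// processAll fans out one goroutine per job and merges results under a mutex,
// timing the whole operation with the histogram above.
func processAll(jobs map[string][]string) map[string]string {
	timer := prometheus.NewTimer(processDuration)
	defer timer.ObserveDuration()

	var (
		wg     sync.WaitGroup
		mtx    sync.Mutex
		merged = map[string]string{}
	)
	for job, groups := range jobs {
		wg.Add(1)
		go func(job string, groups []string) {
			defer wg.Done()
			result := processJob(job, groups)
			mtx.Lock()
			for k, v := range result {
				merged[k] = v
			}
			mtx.Unlock()
		}(job, groups)
	}
	wg.Wait()
	return merged
}

func main() {
	_ = processAll(map[string][]string{"job-a": {"g1", "g2"}, "job-b": {"g3"}})
}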

cmd/otel-allocator/benchmark_test.go

+13 -15

@@ -28,7 +28,6 @@ import (
 	"github.com/prometheus/common/model"
 	"github.com/prometheus/prometheus/discovery"
 	"github.com/prometheus/prometheus/discovery/targetgroup"
-	"github.com/prometheus/prometheus/model/labels"
 	"github.com/prometheus/prometheus/model/relabel"
 	"github.com/stretchr/testify/require"
 	ctrl "sigs.k8s.io/controller-runtime"
@@ -45,18 +44,17 @@ import (
 // the HTTP server afterward. Test data is chosen to be reasonably representative of what the Prometheus service discovery
 // outputs in the real world.
 func BenchmarkProcessTargets(b *testing.B) {
-	numTargets := 10000
+	numTargets := 800000
 	targetsPerGroup := 5
 	groupsPerJob := 20
 	tsets := prepareBenchmarkData(numTargets, targetsPerGroup, groupsPerJob)
-	labelsBuilder := labels.NewBuilder(labels.EmptyLabels())
-
-	b.ResetTimer()
 	for _, strategy := range allocation.GetRegisteredAllocatorNames() {
 		b.Run(strategy, func(b *testing.B) {
-			targetDiscoverer, allocator := createTestDiscoverer(strategy, map[string][]*relabel.Config{})
+			targetDiscoverer := createTestDiscoverer(strategy, map[string][]*relabel.Config{})
+			targetDiscoverer.UpdateTsets(tsets)
+			b.ResetTimer()
 			for i := 0; i < b.N; i++ {
-				targetDiscoverer.ProcessTargets(labelsBuilder, tsets, allocator.SetTargets)
+				targetDiscoverer.Reload()
 			}
 		})
 	}
@@ -65,11 +63,10 @@ func BenchmarkProcessTargets(b *testing.B) {
 // BenchmarkProcessTargetsWithRelabelConfig is BenchmarkProcessTargets with a relabel config set. The relabel config
 // does not actually modify any records, but does force the prehook to perform any necessary conversions along the way.
 func BenchmarkProcessTargetsWithRelabelConfig(b *testing.B) {
-	numTargets := 10000
+	numTargets := 800000
 	targetsPerGroup := 5
 	groupsPerJob := 20
 	tsets := prepareBenchmarkData(numTargets, targetsPerGroup, groupsPerJob)
-	labelsBuilder := labels.NewBuilder(labels.EmptyLabels())
 	prehookConfig := make(map[string][]*relabel.Config, len(tsets))
 	for jobName := range tsets {
 		// keep all targets in half the jobs, drop the rest
@@ -91,12 +88,13 @@ func BenchmarkProcessTargetsWithRelabelConfig(b *testing.B) {
 		}
 	}
 
-	b.ResetTimer()
 	for _, strategy := range allocation.GetRegisteredAllocatorNames() {
 		b.Run(strategy, func(b *testing.B) {
-			targetDiscoverer, allocator := createTestDiscoverer(strategy, prehookConfig)
+			targetDiscoverer := createTestDiscoverer(strategy, prehookConfig)
+			targetDiscoverer.UpdateTsets(tsets)
+			b.ResetTimer()
 			for i := 0; i < b.N; i++ {
-				targetDiscoverer.ProcessTargets(labelsBuilder, tsets, allocator.SetTargets)
+				targetDiscoverer.Reload()
 			}
 		})
 	}
@@ -172,7 +170,7 @@ func prepareBenchmarkData(numTargets, targetsPerGroup, groupsPerJob int) map[str
 	return tsets
 }
 
-func createTestDiscoverer(allocationStrategy string, prehookConfig map[string][]*relabel.Config) (*target.Discoverer, allocation.Allocator) {
+func createTestDiscoverer(allocationStrategy string, prehookConfig map[string][]*relabel.Config) *target.Discoverer {
 	ctx := context.Background()
 	logger := ctrl.Log.WithName(fmt.Sprintf("bench-%s", allocationStrategy))
 	ctrl.SetLogger(logr.New(log.NullLogSink{}))
@@ -187,6 +185,6 @@ func createTestDiscoverer(allocationStrategy string, prehookConfig map[string][]
 	registry := prometheus.NewRegistry()
 	sdMetrics, _ := discovery.CreateAndRegisterSDMetrics(registry)
 	discoveryManager := discovery.NewManager(ctx, gokitlog.NewNopLogger(), registry, sdMetrics)
-	targetDiscoverer := target.NewDiscoverer(logger, discoveryManager, allocatorPrehook, srv)
-	return targetDiscoverer, allocator
+	targetDiscoverer := target.NewDiscoverer(logger, discoveryManager, allocatorPrehook, srv, allocator.SetTargets)
+	return targetDiscoverer
 }

cmd/otel-allocator/main.go

+2 -2

@@ -112,7 +112,7 @@ func main() {
 	}
 	discoveryManager = discovery.NewManager(discoveryCtx, gokitlog.NewNopLogger(), prometheus.DefaultRegisterer, sdMetrics)
 
-	targetDiscoverer = target.NewDiscoverer(log, discoveryManager, allocatorPrehook, srv)
+	targetDiscoverer = target.NewDiscoverer(log, discoveryManager, allocatorPrehook, srv, allocator.SetTargets)
 	collectorWatcher, collectorWatcherErr := collector.NewCollectorWatcher(log, cfg.ClusterConfig)
 	if collectorWatcherErr != nil {
 		setupLog.Error(collectorWatcherErr, "Unable to initialize collector watcher")
@@ -175,7 +175,7 @@ func main() {
 				setupLog.Info("Prometheus config empty, skipping initial discovery configuration")
 			}
 
 			err := targetDiscoverer.Run()
-			err := targetDiscoverer.Watch(allocator.SetTargets)
 			setupLog.Info("Target discoverer exited")
 			return err
 		},

cmd/otel-allocator/target/discovery.go

+139 -38

@@ -17,6 +17,8 @@ package target
 import (
 	"hash"
 	"hash/fnv"
+	"sync"
+	"time"
 
 	"github.com/go-logr/logr"
 	"github.com/prometheus/client_golang/prometheus"
@@ -27,6 +29,7 @@ import (
 	"github.com/prometheus/prometheus/discovery/targetgroup"
 	"github.com/prometheus/prometheus/model/labels"
 	"github.com/prometheus/prometheus/model/relabel"
+	"go.uber.org/zap/zapcore"
 	"gopkg.in/yaml.v3"
 
 	allocatorWatcher "github.com/open-telemetry/opentelemetry-operator/cmd/otel-allocator/watcher"
@@ -37,16 +40,33 @@
 		Name: "opentelemetry_allocator_targets",
 		Help: "Number of targets discovered.",
 	}, []string{"job_name"})
+
+	processTargetsDuration = promauto.NewHistogram(prometheus.HistogramOpts{
+		Name: "opentelemetry_allocator_process_targets_duration_seconds",
+		Help: "Duration of processing targets.",
+		Buckets: []float64{1, 5, 10, 30, 60, 120},
+	})
+
+	processTargetGroupsDuration = promauto.NewHistogramVec(prometheus.HistogramOpts{
+		Name: "opentelemetry_allocator_process_target_groups_duration_seconds",
+		Help: "Duration of processing target groups.",
+		Buckets: []float64{1, 5, 10, 30, 60, 120},
+	}, []string{"job_name"})
 )
 
 type Discoverer struct {
-	log logr.Logger
-	manager *discovery.Manager
-	close chan struct{}
-	configsMap map[allocatorWatcher.EventSource][]*promconfig.ScrapeConfig
-	hook discoveryHook
-	scrapeConfigsHash hash.Hash
-	scrapeConfigsUpdater scrapeConfigsUpdater
+	log logr.Logger
+	manager *discovery.Manager
+	close chan struct{}
+	mtxScrape sync.Mutex // Guards the fields below.
+	configsMap map[allocatorWatcher.EventSource][]*promconfig.ScrapeConfig
+	hook discoveryHook
+	scrapeConfigsHash hash.Hash
+	scrapeConfigsUpdater scrapeConfigsUpdater
+	targetSets map[string][]*targetgroup.Group
+	triggerReload chan struct{}
+	processTargetsCallBack func(targets map[string]*Item)
+	mtxTargets sync.Mutex
 }
 
 type discoveryHook interface {
@@ -57,15 +77,17 @@ type scrapeConfigsUpdater interface {
 	UpdateScrapeConfigResponse(map[string]*promconfig.ScrapeConfig) error
 }
 
-func NewDiscoverer(log logr.Logger, manager *discovery.Manager, hook discoveryHook, scrapeConfigsUpdater scrapeConfigsUpdater) *Discoverer {
+func NewDiscoverer(log logr.Logger, manager *discovery.Manager, hook discoveryHook, scrapeConfigsUpdater scrapeConfigsUpdater, setTargets func(targets map[string]*Item)) *Discoverer {
 	return &Discoverer{
-		log: log,
-		manager: manager,
-		close: make(chan struct{}),
-		configsMap: make(map[allocatorWatcher.EventSource][]*promconfig.ScrapeConfig),
-		hook: hook,
-		scrapeConfigsHash: nil, // we want the first update to succeed even if the config is empty
-		scrapeConfigsUpdater: scrapeConfigsUpdater,
+		log: log,
+		manager: manager,
+		close: make(chan struct{}),
+		triggerReload: make(chan struct{}, 1),
+		configsMap: make(map[allocatorWatcher.EventSource][]*promconfig.ScrapeConfig),
+		hook: hook,
+		scrapeConfigsHash: nil, // we want the first update to succeed even if the config is empty
+		scrapeConfigsUpdater: scrapeConfigsUpdater,
+		processTargetsCallBack: setTargets,
 	}
 }
 
@@ -105,43 +127,122 @@ func (m *Discoverer) ApplyConfig(source allocatorWatcher.EventSource, scrapeConf
 	return m.manager.ApplyConfig(discoveryCfg)
 }
 
-func (m *Discoverer) Watch(fn func(targets map[string]*Item)) error {
-	labelsBuilder := labels.NewBuilder(labels.EmptyLabels())
+func (m *Discoverer) Run() error {
+	err := m.run(m.manager.SyncCh())
+	if err != nil {
+		m.log.Error(err, "Service Discovery watch event failed")
+		return err
+	}
+	<-m.close
+	m.log.Info("Service Discovery watch event stopped: discovery manager closed")
+	return nil
+}
+
+// UpdateTsets updates the target sets to be scraped.
+func (m *Discoverer) UpdateTsets(tsets map[string][]*targetgroup.Group) {
+	m.mtxScrape.Lock()
+	m.targetSets = tsets
+	m.mtxScrape.Unlock()
+}
+
+// reloader triggers a reload of the scrape configs at regular intervals.
+// The time between reloads is defined by reloadIntervalDuration to avoid overloading the system
+// with too many reloads, because some service discovery mechanisms can be quite chatty.
+func (m *Discoverer) reloader() {
+	reloadIntervalDuration := model.Duration(5 * time.Second)
+	ticker := time.NewTicker(time.Duration(reloadIntervalDuration))
+
+	defer ticker.Stop()
+
 	for {
 		select {
 		case <-m.close:
-			m.log.Info("Service Discovery watch event stopped: discovery manager closed")
-			return nil
-		case tsets := <-m.manager.SyncCh():
-			m.ProcessTargets(labelsBuilder, tsets, fn)
+			return
+		case <-ticker.C:
+			select {
+			case <-m.triggerReload:
+				m.Reload()
+			case <-m.close:
+				return
+			}
 		}
 	}
 }
 
-func (m *Discoverer) ProcessTargets(builder *labels.Builder, tsets map[string][]*targetgroup.Group, fn func(targets map[string]*Item)) {
+// Reload triggers a reload of the scrape configs.
+// This will process the target groups and update the targets concurrently.
+func (m *Discoverer) Reload() {
+	m.mtxScrape.Lock()
+	var wg sync.WaitGroup
 	targets := map[string]*Item{}
+	timer := prometheus.NewTimer(processTargetsDuration)
+	defer timer.ObserveDuration()
+
+	for jobName, groups := range m.targetSets {
+		wg.Add(1)
+		// Run the sync in parallel as these take a while and at high load can't catch up.
+		go func(jobName string, groups []*targetgroup.Group) {
+			processedTargets := m.processTargetGroups(jobName, groups)
+			m.mtxTargets.Lock()
+			for k, v := range processedTargets {
+				targets[k] = v
+			}
+			m.mtxTargets.Unlock()
+			wg.Done()
+		}(jobName, groups)
+	}
+	m.mtxScrape.Unlock()
+	wg.Wait()
+	m.processTargetsCallBack(targets)
+}
 
-	for jobName, tgs := range tsets {
-		var count float64 = 0
-		for _, tg := range tgs {
-			builder.Reset(labels.EmptyLabels())
-			for ln, lv := range tg.Labels {
+// processTargetGroups processes the target groups and returns a map of targets.
+func (m *Discoverer) processTargetGroups(jobName string, groups []*targetgroup.Group) map[string]*Item {
+	builder := labels.NewBuilder(labels.Labels{})
+	timer := prometheus.NewTimer(processTargetGroupsDuration.WithLabelValues(jobName))
+	targets := map[string]*Item{}
+	defer timer.ObserveDuration()
+	var count float64 = 0
+	for _, tg := range groups {
+		builder.Reset(labels.EmptyLabels())
+		for ln, lv := range tg.Labels {
+			builder.Set(string(ln), string(lv))
+		}
+		groupLabels := builder.Labels()
+		for _, t := range tg.Targets {
+			count++
+			builder.Reset(groupLabels)
+			for ln, lv := range t {
 				builder.Set(string(ln), string(lv))
 			}
-			groupLabels := builder.Labels()
-			for _, t := range tg.Targets {
-				count++
-				builder.Reset(groupLabels)
-				for ln, lv := range t {
-					builder.Set(string(ln), string(lv))
-				}
-				item := NewItem(jobName, string(t[model.AddressLabel]), builder.Labels(), "")
-				targets[item.Hash()] = item
+			item := NewItem(jobName, string(t[model.AddressLabel]), builder.Labels(), "")
+			targets[item.Hash()] = item
+		}
+	}
+	targetsDiscovered.WithLabelValues(jobName).Set(count)
+	return targets
+}
+
+// Run receives and saves target set updates and triggers the scraping loops reloading.
+// Reloading happens in the background so that it doesn't block receiving targets updates.
+func (m *Discoverer) run(tsets <-chan map[string][]*targetgroup.Group) error {
+	go m.reloader()
+	for {
+		select {
+		case ts := <-tsets:
+			m.log.V(int(zapcore.DebugLevel)).Info("Service Discovery watch event received", "targets groups", len(ts))
+			m.UpdateTsets(ts)
+
+			select {
+			case m.triggerReload <- struct{}{}:
+			default:
 			}
+
+		case <-m.close:
+			m.log.Info("Service Discovery watch event stopped: discovery manager closed")
+			return nil
 		}
-		targetsDiscovered.WithLabelValues(jobName).Set(count)
 	}
-	fn(targets)
 }
 
 func (m *Discoverer) Close() {
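The core of the new design is the run/reloader pair above: run records every target-set update immediately and does a non-blocking send into the one-slot triggerReload channel, while reloader drains that channel on a ticker (5 seconds in the diff), so bursts of chatty service-discovery updates collapse into a single Reload. Below is a standalone sketch of just that coalescing mechanism, with made-up timings and a print statement in place of Reload(); it is an illustration of the pattern, not code from the repository.

package main

import (
	"fmt"
	"time"
)

func main() {
	triggerReload := make(chan struct{}, 1) // one-slot buffer: at most one pending reload
	done := make(chan struct{})

	// Producer: a stand-in for a chatty service-discovery stream pushing many updates.
	go func() {
		for i := 0; i < 100; i++ {
			select {
			case triggerReload <- struct{}{}: // queue a reload if none is pending
			default: // a reload is already pending; coalesce this update into it
			}
			time.Sleep(10 * time.Millisecond)
		}
		close(done)
	}()

	// Consumer: drain at most one pending reload per tick.
	ticker := time.NewTicker(500 * time.Millisecond)
	defer ticker.Stop()
	for {
		select {
		case <-done:
			return
		case <-ticker.C:
			select {
			case <-triggerReload:
				fmt.Println("reloading targets") // stands in for Reload()
			case <-done:
				return
			}
		}
	}
}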

0 commit comments