Skip to content

Commit 70731ad

Browse files
feat: add plugin execution tracing to OpenTelemetry plugin
This commit adds plugin execution tracing capability to the OpenTelemetry plugin, allowing users to trace individual plugin phases (rewrite, access, header_filter, body_filter, log) as child spans of the main request trace. Changes: - Added trace_plugins configuration option (default: false, opt-in) - Added plugin_span_kind configuration for observability provider compatibility - Enhanced plugin execution with OpenTelemetry span creation and finishing - Added comprehensive request context attributes to plugin spans - Updated documentation with examples and usage instructions - Added comprehensive test suite for the new functionality Features: - Plugin Phase Tracing: Creates child spans for each plugin phase execution - Rich Context: Includes HTTP method, URI, hostname, user agent, route info, and service info - Configurable: Can be enabled/disabled via trace_plugins configuration - Span Kind Control: Supports internal (default) and server span kinds for observability provider compatibility - Proper Hierarchy: Plugin spans are correctly nested under main request spans - Performance: Minimal overhead when disabled (default behavior) Configuration: - trace_plugins: boolean (default: false) - Enable/disable plugin tracing - plugin_span_kind: string (default: 'internal') - Span kind for plugin spans - 'internal': Standard internal operation (may be excluded from metrics) - 'server': Server-side operation (typically included in service-level metrics) This addresses GitHub issue #12510 and provides end-to-end tracing visibility for APISIX plugin execution phases.
1 parent ebe8d6c commit 70731ad

File tree

5 files changed

+772
-4
lines changed

5 files changed

+772
-4
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,7 @@ A/B testing, canary release, blue-green deployment, limit rate, defense against
123123
- **OPS friendly**
124124

125125
- Zipkin tracing: [Zipkin](docs/en/latest/plugins/zipkin.md)
126+
- OpenTelemetry tracing: [OpenTelemetry](docs/en/latest/plugins/opentelemetry.md) with plugin execution tracing
126127
- Open source APM: support [Apache SkyWalking](docs/en/latest/plugins/skywalking.md)
127128
- Works with external service discovery: In addition to the built-in etcd, it also supports [Consul](docs/en/latest/discovery/consul.md), [Consul_kv](docs/en/latest/discovery/consul_kv.md), [Nacos](docs/en/latest/discovery/nacos.md), [Eureka](docs/en/latest/discovery/eureka.md) and [Zookeeper (CP)](https://github.com/api7/apisix-seed/blob/main/docs/en/latest/zookeeper.md).
128129
- Monitoring And Metrics: [Prometheus](docs/en/latest/plugins/prometheus.md)

apisix/plugin.lua

Lines changed: 51 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ local tostring = tostring
3838
local error = error
3939
local getmetatable = getmetatable
4040
local setmetatable = setmetatable
41+
local string_format = string.format
4142
-- make linter happy to avoid error: getting the Lua global "load"
4243
-- luacheck: globals load, ignore lua_load
4344
local lua_load = load
@@ -1169,6 +1170,9 @@ function _M.run_plugin(phase, plugins, api_ctx)
11691170
return api_ctx
11701171
end
11711172

1173+
-- Get OpenTelemetry plugin for tracing
1174+
local otel_plugin = _M.get("opentelemetry")
1175+
11721176
if phase ~= "log"
11731177
and phase ~= "header_filter"
11741178
and phase ~= "body_filter"
@@ -1188,11 +1192,40 @@ function _M.run_plugin(phase, plugins, api_ctx)
11881192
goto CONTINUE
11891193
end
11901194

1195+
-- Start OpenTelemetry plugin span
1196+
if otel_plugin and otel_plugin.start_plugin_span and api_ctx.otel then
1197+
otel_plugin.start_plugin_span(api_ctx, plugins[i]["name"], phase)
1198+
end
1199+
11911200
run_meta_pre_function(conf, api_ctx, plugins[i]["name"])
11921201
plugin_run = true
11931202
api_ctx._plugin_name = plugins[i]["name"]
1194-
local code, body = phase_func(conf, api_ctx)
1203+
local error_msg = nil
1204+
local code, body
1205+
1206+
-- Execute plugin with error handling
1207+
local ok, result = pcall(phase_func, conf, api_ctx)
1208+
if not ok then
1209+
-- Lua exception occurred
1210+
error_msg = string_format("plugin execution failed: %s", result)
1211+
code = 500
1212+
body = nil
1213+
else
1214+
-- Plugin executed successfully, check return values
1215+
code, body = result, nil
1216+
if code and code >= 400 then
1217+
error_msg = string_format("plugin exited with status code %d", code)
1218+
end
1219+
end
1220+
11951221
api_ctx._plugin_name = nil
1222+
1223+
-- Finish OpenTelemetry plugin span (with performance guard)
1224+
if otel_plugin and otel_plugin.finish_plugin_span and
1225+
api_ctx.otel then
1226+
otel_plugin.finish_plugin_span(api_ctx, plugins[i]["name"], phase, error_msg)
1227+
end
1228+
11961229
if code or body then
11971230
if is_http then
11981231
if code >= 400 then
@@ -1226,11 +1259,27 @@ function _M.run_plugin(phase, plugins, api_ctx)
12261259
local phase_func = plugins[i][phase]
12271260
local conf = plugins[i + 1]
12281261
if phase_func and meta_filter(api_ctx, plugins[i]["name"], conf) then
1262+
-- Start OpenTelemetry plugin span (with performance guard)
1263+
if otel_plugin and otel_plugin.start_plugin_span and api_ctx.otel then
1264+
otel_plugin.start_plugin_span(api_ctx, plugins[i]["name"], phase)
1265+
end
1266+
12291267
plugin_run = true
12301268
run_meta_pre_function(conf, api_ctx, plugins[i]["name"])
12311269
api_ctx._plugin_name = plugins[i]["name"]
1232-
phase_func(conf, api_ctx)
1270+
1271+
local error_msg = nil
1272+
local ok, err = pcall(phase_func, conf, api_ctx)
1273+
if not ok then
1274+
error_msg = err
1275+
end
1276+
12331277
api_ctx._plugin_name = nil
1278+
1279+
-- Finish OpenTelemetry plugin span (with performance guard)
1280+
if otel_plugin and otel_plugin.finish_plugin_span and api_ctx.otel then
1281+
otel_plugin.finish_plugin_span(api_ctx, plugins[i]["name"], phase, error_msg)
1282+
end
12341283
end
12351284
end
12361285

apisix/plugins/opentelemetry.lua

Lines changed: 231 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,20 @@ local schema = {
182182
type = "string",
183183
minLength = 1,
184184
}
185+
},
186+
trace_plugins = {
187+
type = "boolean",
188+
description = "whether to trace individual plugin execution",
189+
default = false
190+
},
191+
plugin_span_kind = {
192+
type = "string",
193+
enum = {"internal", "server"},
194+
description = "span kind for plugin execution spans. "
195+
.. "Some observability providers may exclude internal spans from metrics "
196+
.. "and dashboards. Use 'server' if you need plugin spans included in "
197+
.. "service-level metrics.",
198+
default = "internal"
185199
}
186200
}
187201
}
@@ -306,6 +320,166 @@ local function inject_attributes(attributes, wanted_attributes, source, with_pre
306320
end
307321

308322

323+
-- Plugin span management functions
324+
-- =================================
325+
326+
--- Return the tracing record for a plugin, creating it on first use.
-- The record is cached in `api_ctx.otel_plugin_spans[plugin_name]` and has
-- two fields: `span_ctx` (the plugin-level span context returned by
-- `api_ctx.otel.start_span`) and `phases` (per-phase span contexts, keyed by
-- phase name, filled in by the caller).
-- @param api_ctx request context; nothing is created unless `api_ctx.otel` is set
-- @param plugin_name name of the plugin the span is opened for
-- @return the record table, or nil when `api_ctx.otel` is not set
local function get_or_create_plugin_span(api_ctx, plugin_name)
    local otel = api_ctx.otel
    if not otel then
        return nil
    end

    -- lazily create the per-request registry of plugin records
    local registry = api_ctx.otel_plugin_spans
    if not registry then
        registry = {}
        api_ctx.otel_plugin_spans = registry
    end

    local record = registry[plugin_name]
    if record then
        return record
    end

    -- first time this plugin is seen: open its plugin-level span
    local plugin_span_ctx = otel.start_span({
        name = plugin_name,
        kind = api_ctx.otel_plugin_span_kind,
        attributes = { attr.string("apisix.plugin_name", plugin_name) }
    })

    registry[plugin_name] = {
        span_ctx = plugin_span_ctx,
        phases = {}
    }

    return registry[plugin_name]
end
351+
352+
--- Open a span for one phase of a plugin, parented to the plugin-level span.
-- The new span context is stored in the plugin record under `phases[phase]`,
-- and `api_ctx._current_phase` is set to `phase` so that spans started while
-- the phase runs can pick the phase span as their parent.
-- @param api_ctx request context
-- @param plugin_name name of the plugin being executed
-- @param phase name of the phase being executed
-- @return the phase span context, or nil when no plugin record is available
local function create_phase_span(api_ctx, plugin_name, phase)
    local record = get_or_create_plugin_span(api_ctx, plugin_name)
    if not record then
        return nil
    end

    local phase_span_name = plugin_name .. " " .. phase
    local phase_attributes = {
        attr.string("apisix.plugin_name", plugin_name),
        attr.string("apisix.plugin_phase", phase),
    }

    -- phase spans always use the internal kind, regardless of the kind
    -- chosen for the plugin-level span
    local ctx = api_ctx.otel.start_span({
        parent = record.span_ctx,
        name = phase_span_name,
        kind = span_kind.internal,
        attributes = phase_attributes,
    })

    record.phases[phase] = ctx
    api_ctx._current_phase = phase

    return ctx
end
374+
375+
--- Close the span of one plugin phase and, once no phase span of that plugin
-- remains open, close the plugin-level span and drop the plugin record.
-- Does nothing when the plugin has no record in `api_ctx.otel_plugin_spans`.
-- @param api_ctx request context
-- @param plugin_name name of the plugin whose phase ended
-- @param phase name of the phase that ended
-- @param error_msg optional message; forwarded to `stop_span` for the phase span
local function finish_phase_span(api_ctx, plugin_name, phase, error_msg)
    local registry = api_ctx.otel_plugin_spans
    local record = registry and registry[plugin_name]
    if not record then
        return
    end

    local open_phases = record.phases
    local ctx = open_phases[phase]
    if ctx then
        api_ctx.otel.stop_span(ctx, error_msg)
        open_phases[phase] = nil

        -- only reset the marker if it still points at the phase just closed
        if api_ctx._current_phase == phase then
            api_ctx._current_phase = nil
        end
    end

    -- is any phase span of this plugin still open?
    local has_open_phase = false
    for _ in pairs(record.phases) do
        has_open_phase = true
        break
    end

    if not has_open_phase then
        -- the plugin-level span is closed without an error message
        api_ctx.otel.stop_span(record.span_ctx)
        api_ctx.otel_plugin_spans[plugin_name] = nil
    end
end
404+
405+
--- Close every plugin and phase span that is still open for this request.
-- Each remaining span is stopped through `api_ctx.otel.stop_span` with the
-- message "plugin cleanup"; afterwards the plugin record registry and the
-- current-phase marker are cleared, so calling this again is a no-op.
-- Returns immediately (leaving `api_ctx` untouched) when no registry exists.
-- @param api_ctx request context
local function cleanup_plugin_spans(api_ctx)
    local registry = api_ctx.otel_plugin_spans
    if not registry then
        return
    end

    -- keys are not needed here, so they are bound to `_` instead of named
    -- (unused) loop variables
    for _, plugin_data in pairs(registry) do
        -- finish remaining phase spans first so children end before the parent
        for _, phase_span_ctx in pairs(plugin_data.phases) do
            api_ctx.otel.stop_span(phase_span_ctx, "plugin cleanup")
        end

        -- finish the plugin-level span
        api_ctx.otel.stop_span(plugin_data.span_ctx, "plugin cleanup")
    end

    api_ctx.otel_plugin_spans = nil
    api_ctx._current_phase = nil
end
424+
425+
426+
-- OpenTelemetry API for plugins
427+
-- =============================
428+
429+
--- Build the small tracing facade that is stored on the request context.
-- The returned table exposes three functions:
--   * `start_span(span_info)`: starts a span via `tracer:start` and returns
--     whatever it returns; returns nil when `span_info` or `span_info.name`
--     is missing.
--   * `stop_span(span_ctx, error_msg)`: finishes the span behind `span_ctx`,
--     setting an ERROR status first when `error_msg` is given.
--   * `get_plugin_context(plugin_name)`: looks up the plugin record in
--     `api_ctx.otel_plugin_spans`.
-- @param api_ctx request context the facade reads plugin/phase state from
-- @param tracer tracer object providing `start(parent, name, opts)`
-- @param main_context context used as parent when nothing more specific exists
-- @return the facade table
local function create_otel_api(api_ctx, tracer, main_context)
    local api = {}

    function api.start_span(span_info)
        if not span_info or not span_info.name then
            return nil
        end

        -- Parent resolution, most specific first:
        --   1. the parent passed by the caller
        --   2. the span of the phase currently running for the active plugin
        --   3. the active plugin's own span
        --   4. the main request context
        local parent = span_info.parent
        if not parent then
            local active_plugin = api_ctx._plugin_name
            local registry = api_ctx.otel_plugin_spans
            local record = active_plugin and registry and registry[active_plugin]
            if record then
                local current_phase = api_ctx._current_phase
                parent = (current_phase and record.phases
                          and record.phases[current_phase])
                         or record.span_ctx
            end
            parent = parent or main_context
        end

        -- the kind is passed through untouched (callers are expected to hand
        -- in span_kind constants); internal is the default
        return tracer:start(parent, span_info.name, {
            kind = span_info.kind or span_kind.internal,
            attributes = span_info.attributes or {},
        })
    end

    function api.stop_span(span_ctx, error_msg)
        local span = span_ctx and span_ctx:span()
        if not span then
            return
        end

        if error_msg then
            span:set_status(span_status.ERROR, error_msg)
        end

        span:finish()
    end

    function api.get_plugin_context(plugin_name)
        local registry = api_ctx.otel_plugin_spans
        return registry and registry[plugin_name]
    end

    return api
end
482+
309483
function _M.rewrite(conf, api_ctx)
310484
local metadata = plugin.plugin_metadata(plugin_name)
311485
if metadata == nil then
@@ -323,7 +497,7 @@ function _M.rewrite(conf, api_ctx)
323497
return
324498
end
325499

326-
local span_name = vars.method
500+
local span_name = string_format("http.%s", vars.method)
327501

328502
local attributes = {
329503
attr.string("net.host.name", vars.host),
@@ -337,7 +511,7 @@ function _M.rewrite(conf, api_ctx)
337511
table.insert(attributes, attr.string("apisix.route_id", api_ctx.route_id))
338512
table.insert(attributes, attr.string("apisix.route_name", api_ctx.route_name))
339513
table.insert(attributes, attr.string("http.route", api_ctx.curr_req_matched._path))
340-
span_name = span_name .. " " .. api_ctx.curr_req_matched._path
514+
span_name = string_format("http.%s %s", vars.method, api_ctx.curr_req_matched._path)
341515
end
342516

343517
if api_ctx.service_id then
@@ -378,6 +552,21 @@ function _M.rewrite(conf, api_ctx)
378552

379553
api_ctx.otel_context_token = ctx:attach()
380554

555+
-- Store tracer and configuration for plugin tracing
556+
if conf.trace_plugins then
557+
-- Map string span kind to span_kind constant
558+
local kind_mapping = {
559+
internal = span_kind.internal,
560+
server = span_kind.server,
561+
}
562+
local span_kind_value = conf.plugin_span_kind or "internal"
563+
api_ctx.otel_plugin_span_kind = kind_mapping[span_kind_value] or span_kind.internal
564+
565+
-- Create OpenTelemetry API for plugins
566+
api_ctx.otel = create_otel_api(api_ctx, tracer, ctx)
567+
568+
end
569+
381570
-- inject trace context into the headers of upstream HTTP request
382571
trace_context_propagator:inject(ctx, ngx.req)
383572
end
@@ -400,6 +589,8 @@ function _M.delayed_body_filter(conf, api_ctx)
400589
span:set_attributes(attr.int("http.status_code", upstream_status))
401590

402591
span:finish()
592+
-- Cleanup plugin spans
593+
cleanup_plugin_spans(api_ctx)
403594
end
404595
end
405596

@@ -419,7 +610,45 @@ function _M.log(conf, api_ctx)
419610
end
420611

421612
span:finish()
613+
-- Clear the context token to prevent double finishing
614+
api_ctx.otel_context_token = nil
615+
616+
-- Cleanup plugin spans (guaranteed cleanup on request end)
617+
cleanup_plugin_spans(api_ctx)
618+
end
619+
end
620+
621+
622+
-- Public functions for plugin tracing integration
623+
-- ===============================================
624+
625+
--- Start a span for one phase of a plugin.
-- No span is started when plugin tracing is not set up on this request
-- (`api_ctx.otel` is unset) or when the plugin is "opentelemetry" itself,
-- which is deliberately excluded from tracing.
-- @param api_ctx request context
-- @param plugin_name name of the plugin about to run
-- @param phase name of the phase about to run
-- @return the phase span context, or nil when no span was started
function _M.start_plugin_span(api_ctx, plugin_name, phase)
    local traceable = api_ctx.otel and plugin_name ~= "opentelemetry"
    if traceable then
        return create_phase_span(api_ctx, plugin_name, phase)
    end

    return nil
end
638+
639+
640+
--- Finish the span of one phase of a plugin.
-- Mirrors `start_plugin_span`: nothing happens when plugin tracing is not set
-- up on this request (`api_ctx.otel` is unset) or when the plugin is
-- "opentelemetry" itself.
-- @param api_ctx request context
-- @param plugin_name name of the plugin that just ran
-- @param phase name of the phase that just ran
-- @param error_msg optional error description passed on to the phase span
function _M.finish_plugin_span(api_ctx, plugin_name, phase, error_msg)
    if api_ctx.otel and plugin_name ~= "opentelemetry" then
        finish_phase_span(api_ctx, plugin_name, phase, error_msg)
    end
end
424653

425654

0 commit comments

Comments
 (0)