Skip to content

Commit 7705fa7

Browse files
committed
XXX add basic otel tracing
1 parent 29fb013 commit 7705fa7

File tree

5 files changed

+203
-1
lines changed

5 files changed

+203
-1
lines changed

dropshot/Cargo.toml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,11 @@ tokio-rustls = "0.25.0"
4848
toml = "0.8.19"
4949
waitgroup = "0.1.2"
5050

51+
opentelemetry = { version = "0.27", optional = true }
52+
opentelemetry-http = { version = "0.27", features = ["hyper"], optional = true }
53+
opentelemetry-semantic-conventions = { version = "0.27", optional = true }
54+
tracing = { version = "0.1", optional = true }
55+
5156
[dependencies.chrono]
5257
version = "0.4.38"
5358
features = [ "serde", "std", "clock" ]
@@ -136,6 +141,8 @@ version_check = "0.9.5"
136141
[features]
137142
usdt-probes = ["usdt/asm"]
138143
internal-docs = ["simple-mermaid"]
144+
otel-tracing = ["opentelemetry", "opentelemetry-http", "opentelemetry-semantic-conventions"]
145+
tokio-tracing = ["tracing"]
139146

140147
[package.metadata.docs.rs]
141148
features = ["internal-docs"]

dropshot/src/handler.rs

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,8 @@ pub struct RequestContext<Context: ServerContext> {
8686
pub log: Logger,
8787
/// basic request information (method, URI, etc.)
8888
pub request: RequestInfo,
89+
#[cfg(feature = "otel-tracing")]
90+
pub span_context: opentelemetry::trace::SpanContext,
8991
}
9092

9193
// This is deliberately as close to compatible with `hyper::Request` as
@@ -377,6 +379,19 @@ where
377379
}
378380
}
379381

382+
impl std::fmt::Display for HandlerError {
383+
fn fmt(&self, f: &mut Formatter) -> FmtResult {
384+
write!(
385+
f,
386+
"{}",
387+
match self {
388+
Self::Handler { ref message, .. } => message,
389+
Self::Dropshot(ref e) => &e.external_message,
390+
}
391+
)
392+
}
393+
}
394+
380395
/// An error type which can be converted into an HTTP response.
381396
///
382397
/// The error types returned by handlers must implement this trait, so that a

dropshot/src/lib.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -834,6 +834,8 @@ mod from_map;
834834
mod handler;
835835
mod http_util;
836836
mod logging;
837+
#[cfg(feature = "otel-tracing")]
838+
mod otel;
837839
mod pagination;
838840
mod router;
839841
mod schema_util;

dropshot/src/otel.rs

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
// Copyright 2024 Oxide Computer Company
2+
//! Opentelemetry tracing support
3+
//!
4+
//! Fields that we want to produce to provide comparable
5+
//! functionality to reqwest-tracing[1]:
6+
//!
7+
//! - http.request.method
8+
//! - url.scheme
9+
//! - server.address
10+
//! - server.port
11+
//! - otel.kind
12+
//! - otel.name
13+
//! - otel.status_code
14+
//! - user_agent.original
15+
//! - http.response.status_code
16+
//! - error.message
17+
//! - error.cause_chain
18+
//!
19+
//! [1] <https://docs.rs/reqwest-tracing/0.5.4/reqwest_tracing/macro.reqwest_otel_span.html>
20+
21+
use opentelemetry::{global, trace::Span, trace::TracerProvider, trace::Tracer};
22+
use opentelemetry_http::HeaderExtractor;
23+
use opentelemetry_semantic_conventions::trace;
24+
25+
// - http.request.method
26+
// - url.scheme
27+
// - server.address
28+
// - server.port
29+
// - otel.kind
30+
// - otel.name
31+
// - otel.status_code
32+
// - user_agent.original
33+
#[derive(Debug, Clone, serde::Serialize)]
34+
pub(crate) struct RequestInfo {
35+
pub id: String,
36+
pub local_addr: std::net::SocketAddr,
37+
pub remote_addr: std::net::SocketAddr,
38+
pub method: String,
39+
pub path: String,
40+
pub query: Option<String>,
41+
}
42+
43+
#[derive(Debug, Clone, serde::Serialize)]
44+
pub(crate) struct ResponseInfo {
45+
pub id: String,
46+
pub local_addr: std::net::SocketAddr,
47+
pub remote_addr: std::net::SocketAddr,
48+
pub status_code: u16,
49+
pub message: String,
50+
}
51+
52+
fn extract_context_from_request(
53+
request: &hyper::Request<hyper::body::Incoming>,
54+
) -> opentelemetry::Context {
55+
global::get_text_map_propagator(|propagator| {
56+
propagator.extract(&HeaderExtractor(request.headers()))
57+
})
58+
}
59+
60+
pub fn create_request_span(
61+
request: &hyper::Request<hyper::body::Incoming>,
62+
) -> opentelemetry::global::BoxedSpan {
63+
let tracer_provider = global::tracer_provider();
64+
let scope = opentelemetry::InstrumentationScope::builder("dropshot_tracing")
65+
.with_version(env!("CARGO_PKG_VERSION"))
66+
.with_schema_url("https://opentelemetry.io/schemas/1.17.0")
67+
.build();
68+
let tracer = tracer_provider.tracer_with_scope(scope);
69+
let parent_cx = extract_context_from_request(&request);
70+
tracer
71+
.span_builder("dropshot_request") //XXX Fixme
72+
.with_kind(opentelemetry::trace::SpanKind::Server)
73+
.start_with_context(&tracer, &parent_cx)
74+
}
75+
76+
pub trait TraceDropshot {
77+
fn trace_request(&mut self, request: RequestInfo);
78+
fn trace_response(&mut self, response: ResponseInfo);
79+
}
80+
81+
impl TraceDropshot for opentelemetry::global::BoxedSpan {
82+
fn trace_request(&mut self, request: RequestInfo) {
83+
self.set_attributes(vec![
84+
opentelemetry::KeyValue::new("http.id".to_string(), request.id),
85+
opentelemetry::KeyValue::new(
86+
"http.method".to_string(),
87+
request.method,
88+
),
89+
opentelemetry::KeyValue::new("http.path".to_string(), request.path),
90+
]);
91+
}
92+
fn trace_response(&mut self, response: ResponseInfo) {
93+
self.set_attributes(vec![
94+
opentelemetry::KeyValue::new(
95+
trace::HTTP_RESPONSE_STATUS_CODE,
96+
i64::from(response.status_code),
97+
),
98+
opentelemetry::KeyValue::new(
99+
"http.message".to_string(),
100+
response.message,
101+
),
102+
]);
103+
}
104+
}

dropshot/src/server.rs

Lines changed: 75 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,11 @@ use super::router::HttpRouter;
1313
use super::versioning::VersionPolicy;
1414
use super::ProbeRegistration;
1515

16+
#[cfg(feature = "otel-tracing")]
17+
use crate::{otel, otel::TraceDropshot};
18+
#[cfg(feature = "otel-tracing")]
19+
use opentelemetry::trace::Span;
20+
1621
use async_stream::stream;
1722
use debug_ignore::DebugIgnore;
1823
use futures::future::{
@@ -730,6 +735,10 @@ impl<C: ServerContext> FusedFuture for HttpServer<C> {
730735
/// invoked by Hyper when a new request is received. This function returns a
731736
/// Result that either represents a valid HTTP response or an error (which will
732737
/// also get turned into an HTTP response).
738+
#[cfg_attr(feature = "tokio-tracing", tracing::instrument(
739+
err,
740+
skip_all,
741+
))]
733742
async fn http_request_handle_wrap<C: ServerContext>(
734743
server: Arc<DropshotState<C>>,
735744
remote_addr: SocketAddr,
@@ -774,6 +783,18 @@ async fn http_request_handle_wrap<C: ServerContext>(
774783
}
775784
}
776785

786+
#[cfg(feature = "otel-tracing")]
787+
let mut span = otel::create_request_span(&request);
788+
#[cfg(feature = "otel-tracing")]
789+
span.trace_request(crate::otel::RequestInfo {
790+
id: request_id.clone(),
791+
local_addr: server.local_addr,
792+
remote_addr,
793+
method: request.method().to_string(),
794+
path: request.uri().path().to_string(),
795+
query: request.uri().query().map(|x| x.to_string()),
796+
});
797+
777798
trace!(request_log, "incoming request");
778799
#[cfg(feature = "usdt-probes")]
779800
probes::request__start!(|| {
@@ -790,9 +811,12 @@ async fn http_request_handle_wrap<C: ServerContext>(
790811

791812
// Copy local address to report later during the finish probe, as the
792813
// server is passed by value to the request handler function.
793-
#[cfg(feature = "usdt-probes")]
814+
#[cfg(any(feature = "usdt-probes", feature = "otel-tracing"))]
794815
let local_addr = server.local_addr;
795816

817+
#[cfg(feature = "otel-tracing")]
818+
let span_context = span.span_context().clone();
819+
796820
// In the case the client disconnects early, the scopeguard allows us
797821
// to perform extra housekeeping before this task is dropped.
798822
let on_disconnect = guard((), |_| {
@@ -802,6 +826,18 @@ async fn http_request_handle_wrap<C: ServerContext>(
802826
"latency_us" => latency_us,
803827
);
804828

829+
#[cfg(feature = "otel-tracing")]
830+
span.trace_response(crate::otel::ResponseInfo {
831+
id: request_id.clone(),
832+
local_addr,
833+
remote_addr,
834+
// 499 is a non-standard code popularized by nginx to mean "client disconnected".
835+
status_code: 499,
836+
message: String::from(
837+
"client disconnected before response returned",
838+
),
839+
});
840+
805841
#[cfg(feature = "usdt-probes")]
806842
probes::request__done!(|| {
807843
crate::dtrace::ResponseInfo {
@@ -823,6 +859,8 @@ async fn http_request_handle_wrap<C: ServerContext>(
823859
&request_id,
824860
request_log.new(o!()),
825861
remote_addr,
862+
#[cfg(feature = "otel-tracing")]
863+
span_context,
826864
)
827865
.await;
828866

@@ -838,6 +876,17 @@ async fn http_request_handle_wrap<C: ServerContext>(
838876
let message_external = error.external_message();
839877
let message_internal = error.internal_message();
840878

879+
#[cfg(feature = "otel-tracing")]
880+
span.trace_response(crate::otel::ResponseInfo {
881+
id: request_id.clone(),
882+
local_addr,
883+
remote_addr,
884+
status_code: status.as_u16(),
885+
message: message_external
886+
.cloned()
887+
.unwrap_or_else(|| message_internal.clone()),
888+
});
889+
841890
#[cfg(feature = "usdt-probes")]
842891
probes::request__done!(|| {
843892
crate::dtrace::ResponseInfo {
@@ -869,6 +918,15 @@ async fn http_request_handle_wrap<C: ServerContext>(
869918
"latency_us" => latency_us,
870919
);
871920

921+
#[cfg(feature = "otel-tracing")]
922+
span.trace_response(crate::otel::ResponseInfo {
923+
id: request_id.parse().unwrap(),
924+
local_addr,
925+
remote_addr,
926+
status_code: response.status().as_u16(),
927+
message: "".to_string(),
928+
});
929+
872930
#[cfg(feature = "usdt-probes")]
873931
probes::request__done!(|| {
874932
crate::dtrace::ResponseInfo {
@@ -887,12 +945,26 @@ async fn http_request_handle_wrap<C: ServerContext>(
887945
Ok(response)
888946
}
889947

948+
#[cfg_attr(feature = "tokio-tracing", tracing::instrument(
949+
err,
950+
skip_all,
951+
fields(
952+
http.method = request.method().as_str().to_string(),
953+
http.uri = request.uri().to_string(),
954+
http.version = format!("{:#?}",request.version()),
955+
http.headers.accept = format!("{:#?}", request.headers()["accept"]),
956+
http.headers.host = format!("{:#?}", request.headers()["host"]),
957+
//http.headers.user_agent = format!("{:#?}", request.headers()["user-agent"]),
958+
),
959+
))]
890960
async fn http_request_handle<C: ServerContext>(
891961
server: Arc<DropshotState<C>>,
892962
request: Request<hyper::body::Incoming>,
893963
request_id: &str,
894964
request_log: Logger,
895965
remote_addr: std::net::SocketAddr,
966+
#[cfg(feature = "otel-tracing")]
967+
span_context: opentelemetry::trace::SpanContext,
896968
) -> Result<Response<Body>, HandlerError> {
897969
// TODO-hardening: is it correct to (and do we correctly) read the entire
898970
// request body even if we decide it's too large and are going to send a 400
@@ -916,6 +988,8 @@ async fn http_request_handle<C: ServerContext>(
916988
endpoint: lookup_result.endpoint,
917989
request_id: request_id.to_string(),
918990
log: request_log,
991+
#[cfg(feature = "otel-tracing")]
992+
span_context: span_context,
919993
};
920994
let handler = lookup_result.handler;
921995

0 commit comments

Comments
 (0)