Skip to content

Commit 3ab7a05

Browse files
taloricsharang
authored and committed
fix: add x_request_id for server_span match
1 parent 2da2a08 commit 3ab7a05

File tree

1 file changed

+59
-9
lines changed

1 file changed

+59
-9
lines changed

app/app/application/l7_flow_tracing.py

+59-9
Original file line numberDiff line numberDiff line change
@@ -1442,8 +1442,11 @@ def __init__(self, group_key: str):
14421442
self.app_span_roots: List[SpanNode] = None
14431443
# 用于存放 `app_span` 的所有 leaf
14441444
self.app_span_leafs: List[SpanNode] = None
1445+
# 记录叶子节点的 syscall_trace_id, 用以匹配 s-p root
14451446
self.leaf_syscall_trace_id_request: Set[int] = set()
14461447
self.leaf_syscall_trace_id_response: Set[int] = set()
1448+
# 记录叶子节点的 x_request_id => index (in self.spans), 用以匹配 s-p root
1449+
self.leaf_x_request_id: Dict[str, List[int]] = {}
14471450
# 用于显示调用拓扑使用
14481451
self.subnet_id = None
14491452
self.subnet = None
@@ -1552,10 +1555,23 @@ def append_sys_span(self, sys_span: SysSpanNode):
15521555
self._set_extra_value_for_sys_span(sys_span)
15531556
self._set_auto_service(sys_span)
15541557
if sys_span.tap_side == TAP_SIDE_CLIENT_PROCESS:
1555-
self.leaf_syscall_trace_id_request.add(
1556-
sys_span.get_syscall_trace_id_request())
1557-
self.leaf_syscall_trace_id_response.add(
1558-
sys_span.get_syscall_trace_id_response())
1558+
cp_syscall_trace_id_req = sys_span.get_syscall_trace_id_request()
1559+
cp_syscall_trace_id_res = sys_span.get_syscall_trace_id_response()
1560+
cp_x_request_id_0 = sys_span.get_x_request_id_0()
1561+
cp_x_request_id_1 = sys_span.get_x_request_id_1()
1562+
if cp_syscall_trace_id_req:
1563+
self.leaf_syscall_trace_id_request.add(
1564+
sys_span.get_syscall_trace_id_request())
1565+
if cp_syscall_trace_id_res:
1566+
self.leaf_syscall_trace_id_response.add(
1567+
sys_span.get_syscall_trace_id_response())
1568+
if cp_x_request_id_0:
1569+
# index of sys_span = len(self.spans)-1
1570+
self.leaf_x_request_id.setdefault(
1571+
cp_x_request_id_0, []).append(len(self.spans) - 1)
1572+
if cp_x_request_id_1 and cp_x_request_id_1 != cp_x_request_id_0:
1573+
self.leaf_x_request_id.setdefault(
1574+
cp_x_request_id_1, []).append(len(self.spans) - 1)
15591575

15601576
def remove_server_sys_span(self, sys_span: SysSpanNode):
15611577
# 这里应该要做 append_sys_span 的逆操作(但对象仅为 ServerProcess sys_span)
@@ -1635,7 +1651,7 @@ def split_to_multiple_process_span_set(self) -> list:
16351651
# 极端情况下可能会有多个没有 parent_span_id 的入口,这里没法分辨它们的关系,不做拆分
16361652
if root_parent_span_id == '':
16371653
root_parent_span_id = "root" # 只是标记 root_parent_span_id,没有实际作用
1638-
if root_parent_span_id not in split_result:
1654+
if split_result.get(root_parent_span_id, None) is None:
16391655
newSet = ProcessSpanSet(root_parent_span_id)
16401656
newSet.app_span_roots = [self.spans[root_span_index]]
16411657
newSet._copy_meta_data_from(self)
@@ -1668,7 +1684,12 @@ def attach_sys_span_via_app_span(self, sys_span: SysSpanNode) -> bool:
16681684
return self._attach_client_sys_span(sys_span)
16691685

16701686
def _attach_server_sys_span(self, sys_span: SysSpanNode) -> bool:
1687+
# connection priority: span_id > syscall_trace_id > x_request_id
16711688
span_id_of_sys_span = sys_span.get_span_id()
1689+
syscall_trace_id_request = sys_span.get_syscall_trace_id_request()
1690+
syscall_trace_id_response = sys_span.get_syscall_trace_id_response()
1691+
x_request_id_0 = sys_span.get_x_request_id_0()
1692+
x_request_id_1 = sys_span.get_x_request_id_1()
16721693
if span_id_of_sys_span:
16731694
for app_root in self.app_span_roots:
16741695
if span_id_of_sys_span == app_root.get_parent_span_id():
@@ -1707,10 +1728,9 @@ def _attach_server_sys_span(self, sys_span: SysSpanNode) -> bool:
17071728
"s-p sys_span mounted due to same span_id as parent"
17081729
)
17091730
return True
1710-
else:
1711-
syscall_trace_id_request = sys_span.get_syscall_trace_id_request()
1712-
syscall_trace_id_response = sys_span.get_syscall_trace_id_response(
1713-
)
1731+
1732+
# span_id not matched, try syscall_trace_id
1733+
if syscall_trace_id_request or syscall_trace_id_response:
17141734
for app_root in self.app_span_roots:
17151735
# 如果 span_id 不存在,说明可能是入口 span,上游没有注入 span_id,此时根据叶子节点 c-p 的 syscall_trace_id 匹配即可
17161736
# 这里匹配可以严格点,s-p 和 c-p 只会同侧(req-req / res-res)相等,避免误关联一个独立的 c-p
@@ -1722,6 +1742,36 @@ def _attach_server_sys_span(self, sys_span: SysSpanNode) -> bool:
17221742
"s-p sys_span mounted due to syscall_trace_id matched c-p"
17231743
)
17241744
return True
1745+
1746+
# span_id/syscall not matched, try x_request_id
1747+
if x_request_id_0 or x_request_id_1:
1748+
# 场景:过 ingress/nginx 进入服务网关/服务,传递了 x_request_id,且作为首个 span 没有 trace_id/span_id
1749+
# 且发生跨线程调度,无法基于 syscall 关联时,允许通过 s-p.x_request_id(0/1) <=> c-p.x_request_id(0/1) 关联
1750+
# 此处已确保 auto_instance_id 一致 (即同一个进程)
1751+
1752+
# x_req_id 同侧相等: 透传 x_req_id,来自上游
1753+
# x_req_id 异侧相等: 注入 x_req_id,内部产生
1754+
x_req_id_matched = False
1755+
# 同一个进程内时间一定覆盖
1756+
for same_xreqid_idx in self.leaf_x_request_id.get(
1757+
x_request_id_0, []):
1758+
if sys_span.time_range_cover(self.spans[same_xreqid_idx]):
1759+
x_req_id_matched = True
1760+
if not x_req_id_matched:
1761+
for same_xreqid_idx in self.leaf_x_request_id.get(
1762+
x_request_id_1, []):
1763+
if sys_span.time_range_cover(self.spans[same_xreqid_idx]):
1764+
x_req_id_matched = True
1765+
if x_req_id_matched:
1766+
for app_root in self.app_span_roots:
1767+
if app_root.get_parent_id() < 0:
1768+
self.append_sys_span(sys_span)
1769+
app_root.set_parent(
1770+
sys_span,
1771+
"s-p sys_span mounted due to x_request_id matched c-p",
1772+
self.mounted_callback)
1773+
return True
1774+
17251775
return False
17261776

17271777
def _attach_client_sys_span(self, sys_span: SysSpanNode) -> bool:

0 commit comments

Comments
 (0)