Skip to content

Commit 99398aa

Browse files
taloricsharang
authored and committed
fix: add x_request_id for server_span match
1 parent e568b66 commit 99398aa

File tree

1 file changed

+59
-9
lines changed

1 file changed

+59
-9
lines changed

app/app/application/l7_flow_tracing.py

+59-9
Original file line numberDiff line numberDiff line change
@@ -1547,8 +1547,11 @@ def __init__(self, group_key: str):
15471547
self.app_span_roots: List[SpanNode] = None
15481548
# 用于存放 `app_span` 的所有 leaf
15491549
self.app_span_leafs: List[SpanNode] = None
1550+
# 记录叶子节点的 syscall_trace_id, 用以匹配 s-p root
15501551
self.leaf_syscall_trace_id_request: Set[int] = set()
15511552
self.leaf_syscall_trace_id_response: Set[int] = set()
1553+
# 记录叶子节点的 x_request_id => index (in self.spans), 用以匹配 s-p root
1554+
self.leaf_x_request_id: Dict[str, List[int]] = {}
15521555
# 用于显示调用拓扑使用
15531556
self.subnet_id = None
15541557
self.subnet = None
@@ -1659,10 +1662,23 @@ def append_sys_span(self, sys_span: SysSpanNode):
16591662
self._set_extra_value_for_sys_span(sys_span)
16601663
self._set_auto_service(sys_span)
16611664
if sys_span.tap_side == TAP_SIDE_CLIENT_PROCESS:
1662-
self.leaf_syscall_trace_id_request.add(
1663-
sys_span.get_syscall_trace_id_request())
1664-
self.leaf_syscall_trace_id_response.add(
1665-
sys_span.get_syscall_trace_id_response())
1665+
cp_syscall_trace_id_req = sys_span.get_syscall_trace_id_request()
1666+
cp_syscall_trace_id_res = sys_span.get_syscall_trace_id_response()
1667+
cp_x_request_id_0 = sys_span.get_x_request_id_0()
1668+
cp_x_request_id_1 = sys_span.get_x_request_id_1()
1669+
if cp_syscall_trace_id_req:
1670+
self.leaf_syscall_trace_id_request.add(
1671+
sys_span.get_syscall_trace_id_request())
1672+
if cp_syscall_trace_id_res:
1673+
self.leaf_syscall_trace_id_response.add(
1674+
sys_span.get_syscall_trace_id_response())
1675+
if cp_x_request_id_0:
1676+
# index of sys_span = len(self.spans)-1
1677+
self.leaf_x_request_id.setdefault(
1678+
cp_x_request_id_0, []).append(len(self.spans) - 1)
1679+
if cp_x_request_id_1 and cp_x_request_id_1 != cp_x_request_id_0:
1680+
self.leaf_x_request_id.setdefault(
1681+
cp_x_request_id_1, []).append(len(self.spans) - 1)
16661682

16671683
def remove_server_sys_span(self, sys_span: SysSpanNode):
16681684
# 这里应该要做 append_sys_span 的逆操作(但对象仅为 ServerProcess sys_span)
@@ -1743,7 +1759,7 @@ def split_to_multiple_process_span_set(self) -> list:
17431759
# 极端情况下可能会有多个没有 parent_span_id 的入口,这里没法分辨它们的关系,不做拆分
17441760
if root_parent_span_id == '':
17451761
root_parent_span_id = "root" # 只是标记 root_parent_span_id,没有实际作用
1746-
if root_parent_span_id not in split_result:
1762+
if split_result.get(root_parent_span_id, None) is None:
17471763
newSet = ProcessSpanSet(root_parent_span_id)
17481764
newSet.app_span_roots = [self.spans[root_span_index]]
17491765
newSet._copy_meta_data_from(self)
@@ -1776,7 +1792,12 @@ def attach_sys_span_via_app_span(self, sys_span: SysSpanNode) -> bool:
17761792
return self._attach_client_sys_span(sys_span)
17771793

17781794
def _attach_server_sys_span(self, sys_span: SysSpanNode) -> bool:
1795+
# connection priority: span_id > syscall_trace_id > x_request_id
17791796
span_id_of_sys_span = sys_span.get_span_id()
1797+
syscall_trace_id_request = sys_span.get_syscall_trace_id_request()
1798+
syscall_trace_id_response = sys_span.get_syscall_trace_id_response()
1799+
x_request_id_0 = sys_span.get_x_request_id_0()
1800+
x_request_id_1 = sys_span.get_x_request_id_1()
17801801
if span_id_of_sys_span:
17811802
for app_root in self.app_span_roots:
17821803
if span_id_of_sys_span == app_root.get_parent_span_id():
@@ -1815,10 +1836,9 @@ def _attach_server_sys_span(self, sys_span: SysSpanNode) -> bool:
18151836
"s-p sys_span mounted due to same span_id as parent",
18161837
self.mounted_callback)
18171838
return True
1818-
else:
1819-
syscall_trace_id_request = sys_span.get_syscall_trace_id_request()
1820-
syscall_trace_id_response = sys_span.get_syscall_trace_id_response(
1821-
)
1839+
1840+
# span_id not matched, try syscall_trace_id
1841+
if syscall_trace_id_request or syscall_trace_id_response:
18221842
for app_root in self.app_span_roots:
18231843
# 如果 span_id 不存在,说明可能是入口 span,上游没有注入 span_id,此时根据叶子节点 c-p 的 syscall_trace_id 匹配即可
18241844
# 这里匹配可以严格点,s-p 和 c-p 只会同侧(req-req / res-res)相等,避免误关联一个独立的 c-p
@@ -1830,6 +1850,36 @@ def _attach_server_sys_span(self, sys_span: SysSpanNode) -> bool:
18301850
"s-p sys_span mounted due to syscall_trace_id matched c-p",
18311851
self.mounted_callback)
18321852
return True
1853+
1854+
# span_id/syscall not matched, try x_request_id
1855+
if x_request_id_0 or x_request_id_1:
1856+
# 场景:过 ingress/nginx 进入服务网关/服务,传递了 x_request_id,且作为首个 span 没有 trace_id/span_id
1857+
# 且发生跨线程调度,无法基于 syscall 关联时,允许通过 s-p.x_request_id(0/1) <=> c-p.x_request_id(0/1) 关联
1858+
# 此处已确保 auto_instance_id 一致 (即同一个进程)
1859+
1860+
# x_req_id 同侧相等: 透传 x_req_id,来自上游
1861+
# x_req_id 异侧相等: 注入 x_req_id,内部产生
1862+
x_req_id_matched = False
1863+
# 同一个进程内时间一定覆盖
1864+
for same_xreqid_idx in self.leaf_x_request_id.get(
1865+
x_request_id_0, []):
1866+
if sys_span.time_range_cover(self.spans[same_xreqid_idx]):
1867+
x_req_id_matched = True
1868+
if not x_req_id_matched:
1869+
for same_xreqid_idx in self.leaf_x_request_id.get(
1870+
x_request_id_1, []):
1871+
if sys_span.time_range_cover(self.spans[same_xreqid_idx]):
1872+
x_req_id_matched = True
1873+
if x_req_id_matched:
1874+
for app_root in self.app_span_roots:
1875+
if app_root.get_parent_id() < 0:
1876+
self.append_sys_span(sys_span)
1877+
app_root.set_parent(
1878+
sys_span,
1879+
"s-p sys_span mounted due to x_request_id matched c-p",
1880+
self.mounted_callback)
1881+
return True
1882+
18331883
return False
18341884

18351885
def _attach_client_sys_span(self, sys_span: SysSpanNode) -> bool:

0 commit comments

Comments
 (0)