Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 38 additions & 7 deletions snuba/utils/metrics/backends/datadog.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

import logging
import threading
from typing import Callable, Mapping, Optional, Sequence, Union

Expand All @@ -8,6 +9,8 @@
from snuba.utils.metrics.backends.abstract import MetricsBackend
from snuba.utils.metrics.types import Tags

logger = logging.getLogger(__name__)


class DatadogMetricsBackend(MetricsBackend):
"""
Expand All @@ -33,11 +36,24 @@ def __init__(
self.__thread_state = threading.local()

@property
def __client(self) -> DogStatsd:
def __client(self) -> Optional[DogStatsd]:
try:
client = self.__thread_state.client
except AttributeError:
client = self.__thread_state.client = self.__client_factory()
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Outer except only catches AttributeError, not SystemError

High Severity

The PR description states the issue is that threading.local() client access during ThreadPoolExecutor shutdown causes a C-level SystemError. However, the outer try/except on line 40-42 only catches AttributeError. If the SystemError occurs when reading self.__thread_state.client (line 41) — which is exactly the scenario described — it won't be caught, and the crash the PR intends to fix will still happen. The inner except Exception only protects the factory call path, not the attribute access path. The outer except needs to also catch Exception (or at least SystemError) to fully address the described issue.

Fix in Cursor Fix in Web

try:
client = self.__thread_state.client = self.__client_factory()
except Exception as e:
# During thread cleanup (e.g., ThreadPoolExecutor shutdown), the client
# factory may fail or set an exception while returning a value, causing
# a SystemError. We catch this and log it, returning None to allow
# metrics calls to fail silently rather than crashing the application.
logger.warning(
"Failed to create DogStatsd client in thread %s: %s",
threading.current_thread().name,
e,
exc_info=True,
)
return None
Comment on lines +46 to +56
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug: If __client_factory fails, self.__thread_state.client is not set. Subsequent metric calls in the same thread will repeatedly and unsuccessfully attempt to create the client, causing log spam.
Severity: CRITICAL

Suggested Fix

In the except Exception block where the client factory failure is handled, explicitly set self.__thread_state.client = None. This ensures the attribute is set, preventing subsequent calls from re-triggering the client creation logic and the associated AttributeError.

Prompt for AI Agent
Review the code at the location below. A potential bug has been identified by an AI
agent.
Verify if this is a real issue. If it is, propose a fix; if not, explain why it's not
valid.

Location: snuba/utils/metrics/backends/datadog.py#L46-L56

Potential issue: In the `__client` property, if the `self.__client_factory()` call fails
with an exception, the assignment to `self.__thread_state.client` does not complete.
Because the attribute remains unset, any subsequent call to a metric method within the
same thread will trigger another `AttributeError`. This leads to a cycle where the code
repeatedly attempts to create the client, fails, and logs a warning with a full stack
trace for every single metric invocation. This will cause significant log spam and
performance degradation in high-throughput paths.

Did we get this right? 👍 / 👎 to inform future reviews.

return client

def __normalize_tags(self, tags: Optional[Tags]) -> Optional[Sequence[str]]:
Expand All @@ -53,7 +69,10 @@ def increment(
tags: Optional[Tags] = None,
unit: Optional[str] = None,
) -> None:
self.__client.increment(
client = self.__client
if client is None:
return
client.increment(
name,
value,
tags=self.__normalize_tags(tags),
Expand All @@ -67,7 +86,10 @@ def gauge(
tags: Optional[Tags] = None,
unit: Optional[str] = None,
) -> None:
self.__client.gauge(
client = self.__client
if client is None:
return
client.gauge(
name,
value,
tags=self.__normalize_tags(tags),
Expand All @@ -81,7 +103,10 @@ def timing(
tags: Optional[Tags] = None,
unit: Optional[str] = None,
) -> None:
self.__client.timing(
client = self.__client
if client is None:
return
client.timing(
name,
value,
tags=self.__normalize_tags(tags),
Expand All @@ -95,7 +120,10 @@ def distribution(
tags: Optional[Tags] = None,
unit: Optional[str] = None,
) -> None:
self.__client.distribution(
client = self.__client
if client is None:
return
client.distribution(
name,
value,
tags=self.__normalize_tags(tags),
Expand All @@ -110,7 +138,10 @@ def events(
priority: str,
tags: Optional[Tags] = None,
) -> None:
self.__client.event(
client = self.__client
if client is None:
return
client.event(
title=title,
text=text,
alert_type=alert_type,
Expand Down
Loading