Skip to content

Commit de0f9a1

Browse files
authored
feat: add lazy loading (#81)
* feat: add lazy GraphQL fields This commit introduces lazy fetching of GraphQL fields. Now, the `GraphQLFragmentMixin.get_fragments()` method has a new `lazy` argument, which will make it skip certain fields that are considered "large". A field is lazy-loadable when: 1. It is a `List`, `Union` or `Optional` of `GraphQLFragmentMixin`. 2. It is not marked as `NOT_LAZY`. This will make a difference when fetching things like metrics. In "eager" mode, the client will fetch all subfields of each metrics, including dimensions and entities, which makes the response potentially very large. Now, if the client is "lazy", the `.metrics()` method will only return the metrics themselves, and the `dimensions` and `entities` fields will be empty. Certain things like saved query exports don't need lazy fields as their child objects are not large, so it's worth it to just fetch everything in one go. I added two tests for this functionality. One is to make sure that the `get_fragments()` method returns the expected GraphQL fragments for lazy fields. The other is to ensure that all lazy-loadable fields have a default value which can be used to initialize the property locally when it's not initialized from server data. In the next commit, I'll wire this through the client to make it actually work in the APIs. * feat: added `_client` to all GraphQL models This commit adds a private `_client` property to all `GraphQLFragmentMixin` which will get auto-populated by the loading client. This is so that methods such as `Metric.load_dimensions()` will be able to refer back to the client to make requests. * feat: added `SyncMetric` and `AsyncMetric` This is for type checking * docs: changie * docs: add example for lazy loading * docs: add lazy loading to README * fix: small error in README and `_attach_self_to_parsed_response` * test: remove useless test
1 parent 8e317c9 commit de0f9a1

28 files changed

+558
-105
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
kind: Features
2+
body: Add `lazy` parameter to clients which allows lazy loading of certain model fields.
3+
time: 2025-04-14T17:09:39.64588+02:00

README.md

+6
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,12 @@ arrow_table = client.query(...)
9292
polars_df = pl.from_arrow(arrow_table)
9393
```
9494

95+
### Lazy loading
96+
97+
By default, the SDK eagerly requests lists of nested objects. For example, in the list of `Metric` returned by `client.metrics()`, each metric will contain the list of its dimensions, entities and measures. This is convenient in most cases, but it can make the returned data very large if your project is large, which can slow things down.
98+
99+
It is possible to set the client to `lazy=True`, which will make it skip populating nested object lists unless you explicitly ask for them on a per-model basis. Check our [lazy loading example](./examples/list_metrics_lazy_sync.py) to learn more.
100+
95101
### More examples
96102

97103
Check out our [usage examples](./examples/) to learn more.

dbtsl/api/graphql/client/asyncio.py

+8-3
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,8 @@ def __init__(
4949
auth_token: str,
5050
url_format: Optional[str] = None,
5151
timeout: Optional[Union[TimeoutOptions, float, int]] = None,
52+
*,
53+
lazy: bool,
5254
):
5355
"""Initialize the metadata client.
5456
@@ -60,12 +62,13 @@ def __init__(
6062
into a full URL. If `None`, the default `https://{server_host}/api/graphql`
6163
will be assumed.
6264
timeout: TimeoutOptions or total timeout (in seconds) for all GraphQL requests.
65+
lazy: Whether to lazy load large subfields
6366
6467
NOTE: If `timeout` is a `TimeoutOptions`, the `connect_timeout` will not be used, due to
6568
limitations of `gql`'s `aiohttp` transport.
6669
See: https://github.com/graphql-python/gql/blob/b066e8944b0da0a4bbac6c31f43e5c3c7772cd51/gql/transport/aiohttp.py#L110
6770
"""
68-
super().__init__(server_host, environment_id, auth_token, url_format, timeout)
71+
super().__init__(server_host, environment_id, auth_token, url_format, timeout, lazy=lazy)
6972

7073
@override
7174
def _create_transport(self, url: str, headers: Dict[str, str]) -> AIOHTTPTransport:
@@ -97,7 +100,7 @@ async def session(self) -> AsyncIterator[Self]:
97100

98101
async def _run(self, op: ProtocolOperation[TVariables, TResponse], raw_variables: TVariables) -> TResponse:
99102
"""Run a `ProtocolOperation`."""
100-
raw_query = op.get_request_text()
103+
raw_query = op.get_request_text(lazy=self.lazy)
101104
variables = op.get_request_variables(environment_id=self.environment_id, variables=raw_variables)
102105
gql_query = gql(raw_query)
103106

@@ -114,7 +117,9 @@ async def _run(self, op: ProtocolOperation[TVariables, TResponse], raw_variables
114117
except Exception as err:
115118
raise self._refine_err(err)
116119

117-
return op.parse_response(res)
120+
resp = op.parse_response(res)
121+
self._attach_self_to_parsed_response(resp)
122+
return resp
118123

119124
async def _poll_until_complete(
120125
self,

dbtsl/api/graphql/client/asyncio.pyi

+4-2
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,10 @@ from typing_extensions import AsyncIterator, Unpack, overload
88

99
from dbtsl.api.shared.query_params import GroupByParam, OrderByGroupBy, OrderByMetric, QueryParameters
1010
from dbtsl.models import (
11+
AsyncMetric,
1112
Dimension,
1213
Entity,
1314
Measure,
14-
Metric,
1515
SavedQuery,
1616
)
1717
from dbtsl.timeout import TimeoutOptions
@@ -24,11 +24,13 @@ class AsyncGraphQLClient:
2424
auth_token: str,
2525
url_format: Optional[str] = None,
2626
timeout: Optional[Union[TimeoutOptions, float, int]] = None,
27+
*,
28+
lazy: bool,
2729
) -> None: ...
2830
def session(self) -> AbstractAsyncContextManager[AsyncIterator[Self]]: ...
2931
@property
3032
def has_session(self) -> bool: ...
31-
async def metrics(self) -> List[Metric]:
33+
async def metrics(self) -> List[AsyncMetric]:
3234
"""Get a list of all available metrics."""
3335
...
3436

dbtsl/api/graphql/client/base.py

+19
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
)
1414
from dbtsl.backoff import ExponentialBackoff
1515
from dbtsl.error import AuthError
16+
from dbtsl.models.base import GraphQLFragmentMixin
1617
from dbtsl.timeout import TimeoutOptions
1718

1819
TTransport = TypeVar("TTransport", Transport, AsyncTransport)
@@ -59,8 +60,11 @@ def __init__( # noqa: D107
5960
auth_token: str,
6061
url_format: Optional[str] = None,
6162
timeout: Optional[Union[TimeoutOptions, float, int]] = None,
63+
*,
64+
lazy: bool,
6265
):
6366
self.environment_id = environment_id
67+
self.lazy = lazy
6468

6569
url_format = url_format or self.DEFAULT_URL_FORMAT
6670
server_url = url_format.format(server_host=server_host)
@@ -101,6 +105,18 @@ def _refine_err(self, err: Exception) -> Exception:
101105

102106
return err
103107

108+
def _attach_self_to_parsed_response(self, resp: object) -> None:
109+
# NOTE: we're setting the _client_unchecked here instead of making a public property
110+
# because we don't want end-users to be aware of this. You can consider _client_unchecked
111+
# as public to the module but not to end users
112+
if isinstance(resp, GraphQLFragmentMixin):
113+
resp._client_unchecked = self # type: ignore
114+
return
115+
116+
if isinstance(resp, list):
117+
for v in resp: # pyright: ignore[reportUnknownVariableType]
118+
self._attach_self_to_parsed_response(v) # pyright: ignore[reportUnknownArgumentType]
119+
104120
@property
105121
def _gql_session(self) -> TSession:
106122
"""Safe accessor to `_gql_session_unsafe`.
@@ -145,6 +161,8 @@ def __call__(
145161
auth_token: str,
146162
url_format: Optional[str] = None,
147163
timeout: Optional[Union[TimeoutOptions, float, int]] = None,
164+
*,
165+
lazy: bool,
148166
) -> TClient:
149167
"""Initialize the Semantic Layer client.
150168
@@ -154,5 +172,6 @@ def __call__(
154172
auth_token: the API auth token
155173
url_format: the URL format string to construct the final URL with
156174
timeout: `TimeoutOptions` or total timeout
175+
lazy: lazy load large fields
157176
"""
158177
pass

dbtsl/api/graphql/client/sync.py

+8-3
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,8 @@ def __init__(
3838
auth_token: str,
3939
url_format: Optional[str] = None,
4040
timeout: Optional[Union[TimeoutOptions, float, int]] = None,
41+
*,
42+
lazy: bool,
4143
):
4244
"""Initialize the metadata client.
4345
@@ -49,11 +51,12 @@ def __init__(
4951
into a full URL. If `None`, the default `https://{server_host}/api/graphql`
5052
will be assumed.
5153
timeout: TimeoutOptions or total timeout (in seconds) for all GraphQL requests.
54+
lazy: Whether to lazy load large subfields
5255
5356
NOTE: If `timeout` is a `TimeoutOptions`, the `tls_close_timeout` will not be used, since
5457
`requests` does not support TLS termination timeouts.
5558
"""
56-
super().__init__(server_host, environment_id, auth_token, url_format, timeout)
59+
super().__init__(server_host, environment_id, auth_token, url_format, timeout, lazy=lazy)
5760

5861
@override
5962
def _create_transport(self, url: str, headers: Dict[str, str]) -> RequestsHTTPTransport:
@@ -85,7 +88,7 @@ def session(self) -> Iterator[Self]:
8588

8689
def _run(self, op: ProtocolOperation[TVariables, TResponse], raw_variables: TVariables) -> TResponse:
8790
"""Run a `ProtocolOperation`."""
88-
raw_query = op.get_request_text()
91+
raw_query = op.get_request_text(lazy=self.lazy)
8992
variables = op.get_request_variables(environment_id=self.environment_id, variables=raw_variables)
9093
gql_query = gql(raw_query)
9194

@@ -98,7 +101,9 @@ def _run(self, op: ProtocolOperation[TVariables, TResponse], raw_variables: TVar
98101
except Exception as err:
99102
raise self._refine_err(err)
100103

101-
return op.parse_response(res)
104+
resp = op.parse_response(res)
105+
self._attach_self_to_parsed_response(resp)
106+
return resp
102107

103108
def _poll_until_complete(
104109
self,

dbtsl/api/graphql/client/sync.pyi

+4-2
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,8 @@ from dbtsl.models import (
1111
Dimension,
1212
Entity,
1313
Measure,
14-
Metric,
1514
SavedQuery,
15+
SyncMetric,
1616
)
1717
from dbtsl.timeout import TimeoutOptions
1818

@@ -24,11 +24,13 @@ class SyncGraphQLClient:
2424
auth_token: str,
2525
url_format: Optional[str] = None,
2626
timeout: Optional[Union[TimeoutOptions, float, int]] = None,
27+
*,
28+
lazy: bool,
2729
) -> None: ...
2830
def session(self) -> AbstractContextManager[Iterator[Self]]: ...
2931
@property
3032
def has_session(self) -> bool: ...
31-
def metrics(self) -> List[Metric]:
33+
def metrics(self) -> List[SyncMetric]:
3234
"""Get a list of all available metrics."""
3335
...
3436

dbtsl/api/graphql/protocol.py

+15-15
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ class ProtocolOperation(Generic[TVariables, TResponse], ABC):
4646
"""Base class for GraphQL API operations."""
4747

4848
@abstractmethod
49-
def get_request_text(self) -> str:
49+
def get_request_text(self, *, lazy: bool) -> str:
5050
"""Get the GraphQL request text."""
5151
raise NotImplementedError()
5252

@@ -71,15 +71,15 @@ class ListMetricsOperation(ProtocolOperation[EmptyVariables, List[Metric]]):
7171
"""List all available metrics in available in the Semantic Layer."""
7272

7373
@override
74-
def get_request_text(self) -> str:
74+
def get_request_text(self, *, lazy: bool) -> str:
7575
query = """
7676
query getMetrics($environmentId: BigInt!) {
7777
metrics(environmentId: $environmentId) {
7878
...&fragment
7979
}
8080
}
8181
"""
82-
return render_query(query, Metric.gql_fragments())
82+
return render_query(query, Metric.gql_fragments(lazy=lazy))
8383

8484
@override
8585
def get_request_variables(self, environment_id: int, variables: EmptyVariables) -> Dict[str, Any]:
@@ -100,15 +100,15 @@ class ListDimensionsOperation(ProtocolOperation[ListEntitiesOperationVariables,
100100
"""List all dimensions for a given set of metrics."""
101101

102102
@override
103-
def get_request_text(self) -> str:
103+
def get_request_text(self, *, lazy: bool) -> str:
104104
query = """
105105
query getDimensions($environmentId: BigInt!, $metrics: [MetricInput!]!) {
106106
dimensions(environmentId: $environmentId, metrics: $metrics) {
107107
...&fragment
108108
}
109109
}
110110
"""
111-
return render_query(query, Dimension.gql_fragments())
111+
return render_query(query, Dimension.gql_fragments(lazy=lazy))
112112

113113
@override
114114
def get_request_variables(self, environment_id: int, variables: ListEntitiesOperationVariables) -> Dict[str, Any]:
@@ -126,15 +126,15 @@ class ListMeasuresOperation(ProtocolOperation[ListEntitiesOperationVariables, Li
126126
"""List all measures for a given set of metrics."""
127127

128128
@override
129-
def get_request_text(self) -> str:
129+
def get_request_text(self, *, lazy: bool) -> str:
130130
query = """
131131
query getMeasures($environmentId: BigInt!, $metrics: [MetricInput!]!) {
132132
measures(environmentId: $environmentId, metrics: $metrics) {
133133
...&fragment
134134
}
135135
}
136136
"""
137-
return render_query(query, Measure.gql_fragments())
137+
return render_query(query, Measure.gql_fragments(lazy=lazy))
138138

139139
@override
140140
def get_request_variables(self, environment_id: int, variables: ListEntitiesOperationVariables) -> Dict[str, Any]:
@@ -152,15 +152,15 @@ class ListEntitiesOperation(ProtocolOperation[ListEntitiesOperationVariables, Li
152152
"""List all entities for a given set of metrics."""
153153

154154
@override
155-
def get_request_text(self) -> str:
155+
def get_request_text(self, *, lazy: bool) -> str:
156156
query = """
157157
query getEntities($environmentId: BigInt!, $metrics: [MetricInput!]!) {
158158
entities(environmentId: $environmentId, metrics: $metrics) {
159159
...&fragment
160160
}
161161
}
162162
"""
163-
return render_query(query, Entity.gql_fragments())
163+
return render_query(query, Entity.gql_fragments(lazy=lazy))
164164

165165
@override
166166
def get_request_variables(self, environment_id: int, variables: ListEntitiesOperationVariables) -> Dict[str, Any]:
@@ -178,15 +178,15 @@ class ListSavedQueriesOperation(ProtocolOperation[EmptyVariables, List[SavedQuer
178178
"""List all saved queries."""
179179

180180
@override
181-
def get_request_text(self) -> str:
181+
def get_request_text(self, *, lazy: bool) -> str:
182182
query = """
183183
query getSavedQueries($environmentId: BigInt!) {
184184
savedQueries(environmentId: $environmentId) {
185185
...&fragment
186186
}
187187
}
188188
"""
189-
return render_query(query, SavedQuery.gql_fragments())
189+
return render_query(query, SavedQuery.gql_fragments(lazy=lazy))
190190

191191
@override
192192
def get_request_variables(self, environment_id: int, variables: EmptyVariables) -> Dict[str, Any]:
@@ -242,7 +242,7 @@ class CreateQueryOperation(ProtocolOperation[QueryParameters, QueryId]):
242242
"""Create a query that will be processed asynchronously."""
243243

244244
@override
245-
def get_request_text(self) -> str:
245+
def get_request_text(self, *, lazy: bool) -> str:
246246
query = """
247247
mutation createQuery(
248248
$environmentId: BigInt!,
@@ -290,7 +290,7 @@ class GetQueryResultOperation(ProtocolOperation[GetQueryResultVariables, QueryRe
290290
"""Get the results of a query that was already created."""
291291

292292
@override
293-
def get_request_text(self) -> str:
293+
def get_request_text(self, *, lazy: bool) -> str:
294294
query = """
295295
query getQueryResults(
296296
$environmentId: BigInt!,
@@ -302,7 +302,7 @@ def get_request_text(self) -> str:
302302
}
303303
}
304304
"""
305-
return render_query(query, QueryResult.gql_fragments())
305+
return render_query(query, QueryResult.gql_fragments(lazy=lazy))
306306

307307
@override
308308
def get_request_variables(self, environment_id: int, variables: GetQueryResultVariables) -> Dict[str, Any]:
@@ -321,7 +321,7 @@ class CompileSqlOperation(ProtocolOperation[QueryParameters, str]):
321321
"""Get the compiled SQL that would be sent to the warehouse by a query."""
322322

323323
@override
324-
def get_request_text(self) -> str:
324+
def get_request_text(self, *, lazy: bool) -> str:
325325
query = """
326326
mutation compileSql(
327327
$environmentId: BigInt!,

dbtsl/client/asyncio.py

+4
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ def __init__(
2626
auth_token: str,
2727
host: str,
2828
timeout: Optional[Union[TimeoutOptions, float, int]] = None,
29+
*,
30+
lazy: bool = False,
2931
) -> None:
3032
"""Initialize the Semantic Layer client.
3133
@@ -34,6 +36,7 @@ def __init__(
3436
auth_token: the API auth token
3537
host: the Semantic Layer API host
3638
timeout: `TimeoutOptions` or total timeout for the underlying GraphQL client.
39+
lazy: if true, nested metadata queries will need to be explicitly populated on-demand.
3740
"""
3841
super().__init__(
3942
environment_id=environment_id,
@@ -42,6 +45,7 @@ def __init__(
4245
gql_factory=AsyncGraphQLClient,
4346
adbc_factory=AsyncADBCClient,
4447
timeout=timeout,
48+
lazy=lazy,
4549
)
4650

4751
@asynccontextmanager

0 commit comments

Comments
 (0)