Skip to content

Commit 16a29a8

Browse files
ds-filipknefelFilip Knefelmpolomdeepsense
authored
chore: extend logging (#96)
- Added logs to split page hook to improve transparency on the process. - Changed log split hook member variable for clarity. - Added retry counter to retry hook logs. - Moved logging configuration to `__init__` from the `sdk_init` method of a single hook and removed stdout as target for the logger. Logs now go to stderr (the default). Tested on `_sample_docs/layout-parser-paper-fast.pdf` and `_sample_docs/layout-parser-paper.pdf`. First one will log that it cannot be split if run with pdf split. --------- Co-authored-by: Filip Knefel <[email protected]> Co-authored-by: Marek Połom <[email protected]>
1 parent cf36a97 commit 16a29a8

File tree

8 files changed

+165
-85
lines changed

8 files changed

+165
-85
lines changed

_test_unstructured_client/unit/test_custom_hooks.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,8 @@ def test_unit_retry_with_backoff_does_retry(caplog):
4242
assert len(mock.request_history) > 1
4343

4444

45-
def test_unit_backoff_strategy_logs_retries(caplog):
45+
@pytest.mark.parametrize("status_code", [500, 503])
46+
def test_unit_backoff_strategy_logs_retries(status_code: int, caplog):
4647
caplog.set_level(logging.INFO)
4748
filename = "README.md"
4849
backoff_strategy = BackoffStrategy(
@@ -53,8 +54,8 @@ def test_unit_backoff_strategy_logs_retries(caplog):
5354
)
5455

5556
with requests_mock.Mocker() as mock:
56-
# mock a 500 status code for POST requests to the api
57-
mock.post("https://api.unstructured.io/general/v0/general", status_code=500)
57+
# mock a 500/503 status code for POST requests to the api
58+
mock.post("https://api.unstructured.io/general/v0/general", status_code=status_code)
5859
session = UnstructuredClient(api_key_auth=FAKE_KEY)
5960

6061
with open(filename, "rb") as f:
@@ -63,7 +64,8 @@ def test_unit_backoff_strategy_logs_retries(caplog):
6364
req = shared.PartitionParameters(files=files)
6465
with pytest.raises(Exception):
6566
session.general.partition(req, retries=retries)
66-
pattern = re.compile("Response status code: 500. Sleeping before retry.")
67+
pattern = re.compile(f"Failed to process a request due to API server error with status code {status_code}. "
68+
"Attempting retry number 1 after sleep.")
6769
assert bool(pattern.search(caplog.text))
6870

6971

_test_unstructured_client/unit/test_split_pdf_hook.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -43,18 +43,18 @@ async def example():
4343
pass
4444

4545
hook.coroutines_to_execute[operation_id] = [example(), example()]
46-
hook.api_responses[operation_id] = [
46+
hook.api_successful_responses[operation_id] = [
4747
requests.Response(),
4848
requests.Response(),
4949
]
5050

5151
assert len(hook.coroutines_to_execute[operation_id]) == 2
52-
assert len(hook.api_responses[operation_id]) == 2
52+
assert len(hook.api_successful_responses[operation_id]) == 2
5353

5454
hook._clear_operation(operation_id)
5555

5656
assert hook.coroutines_to_execute.get(operation_id) is None
57-
assert hook.api_responses.get(operation_id) is None
57+
assert hook.api_successful_responses.get(operation_id) is None
5858

5959

6060
def test_unit_prepare_request_payload():
Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from .clean_server_url_hook import CleanServerUrlSDKInitHook
2-
from .log_retries_after_error_hook import LogRetriesAfterErrorHook
2+
from .logger_hook import LoggerHook
33
from .suggest_defining_url import SuggestDefiningUrlIf401AfterErrorHook
44
from .split_pdf_hook import SplitPdfHook
5+
import logging

src/unstructured_client/_hooks/custom/log_retries_after_error_hook.py

Lines changed: 0 additions & 47 deletions
This file was deleted.
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
import logging
2+
from typing import Optional, Tuple, Union, DefaultDict
3+
4+
import requests
5+
6+
from unstructured_client._hooks.custom.common import UNSTRUCTURED_CLIENT_LOGGER_NAME
7+
from unstructured_client._hooks.types import (
8+
AfterSuccessContext,
9+
AfterErrorContext,
10+
AfterErrorHook,
11+
SDKInitHook,
12+
)
13+
from collections import defaultdict
14+
15+
logger = logging.getLogger(UNSTRUCTURED_CLIENT_LOGGER_NAME)
16+
17+
18+
class LoggerHook(AfterErrorHook, SDKInitHook):
19+
"""Hook providing custom logging"""
20+
21+
def __init__(self) -> None:
22+
self.retries_counter: DefaultDict[str, int] = defaultdict(int)
23+
24+
def log_retries(self, response: Optional[requests.Response], operation_id: str):
25+
"""Log retries to give users visibility into requests."""
26+
27+
if response is not None and response.status_code // 100 == 5:
28+
logger.info(
29+
"Failed to process a request due to API server error with status code %d. "
30+
"Attempting retry number %d after sleep.",
31+
response.status_code,
32+
self.retries_counter[operation_id],
33+
)
34+
if response.text:
35+
logger.info("Server message - %s", response.text)
36+
37+
def sdk_init(
38+
self, base_url: str, client: requests.Session
39+
) -> Tuple[str, requests.Session]:
40+
logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
41+
return base_url, client
42+
43+
def after_success(
44+
self, hook_ctx: AfterSuccessContext, response: requests.Response
45+
) -> Union[requests.Response, Exception]:
46+
del self.retries_counter[hook_ctx.operation_id]
47+
return response
48+
49+
def after_error(
50+
self,
51+
hook_ctx: AfterErrorContext,
52+
response: Optional[requests.Response],
53+
error: Optional[Exception],
54+
) -> Union[Tuple[Optional[requests.Response], Optional[Exception]], Exception]:
55+
"""Concrete implementation for AfterErrorHook."""
56+
self.retries_counter[hook_ctx.operation_id] += 1
57+
self.log_retries(response, hook_ctx.operation_id)
58+
return response, error

src/unstructured_client/_hooks/custom/request_utils.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,9 @@ def call_api(
108108
return requests.Response()
109109

110110

111-
def prepare_request_headers(headers: CaseInsensitiveDict[str]) -> CaseInsensitiveDict[str]:
111+
def prepare_request_headers(
112+
headers: CaseInsensitiveDict[str],
113+
) -> CaseInsensitiveDict[str]:
112114
"""Prepare the request headers by removing the 'Content-Type' and 'Content-Length' headers.
113115
114116
Args:
@@ -161,3 +163,16 @@ def create_response(response: requests.Response, elements: list) -> requests.Res
161163
response_copy.headers.update({"Content-Length": content_length})
162164
setattr(response_copy, "_content", content)
163165
return response_copy
166+
167+
168+
def log_after_split_response(status_code: int, split_number: int):
169+
if status_code == 200:
170+
logger.info(
171+
"Successfully partitioned set #%d, elements added to the final result.",
172+
split_number,
173+
)
174+
else:
175+
logger.warning(
176+
"Failed to partition set #%d, its elements will be omitted in the final result.",
177+
split_number,
178+
)

0 commit comments

Comments
 (0)