Skip to content

Commit f6e0587

Browse files
committed
[FLINK-35989][Connectors/AWS] Log errors on partially failed requests for AWS Kinesis Stream sink
1 parent ca96d84 commit f6e0587

File tree

3 files changed

+469
-32
lines changed

3 files changed

+469
-32
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.flink.connector.kinesis.sink;
19+
20+
import org.apache.flink.annotation.Internal;
21+
22+
import software.amazon.awssdk.services.kinesis.KinesisAsyncClient;
23+
24+
import java.io.Closeable;
25+
26+
/**
27+
* Provider interface for KinesisAsyncClient instances. This is primarily used for testing to inject
28+
* mock clients.
29+
*/
30+
@Internal
31+
interface KinesisClientProvider extends Closeable {
32+
/**
33+
* Returns a KinesisAsyncClient instance.
34+
*
35+
* @return The KinesisAsyncClient instance
36+
*/
37+
KinesisAsyncClient get();
38+
39+
}

flink-connector-aws/flink-connector-aws-kinesis-streams/src/main/java/org/apache/flink/connector/kinesis/sink/KinesisStreamsSinkWriter.java

+174-17
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
package org.apache.flink.connector.kinesis.sink;
1919

2020
import org.apache.flink.annotation.Internal;
21+
import org.apache.flink.annotation.VisibleForTesting;
2122
import org.apache.flink.api.connector.sink2.Sink;
2223
import org.apache.flink.connector.aws.util.AWSClientUtil;
2324
import org.apache.flink.connector.aws.util.AWSGeneralUtil;
@@ -31,6 +32,7 @@
3132
import org.apache.flink.connector.base.sink.writer.strategy.RateLimitingStrategy;
3233
import org.apache.flink.metrics.Counter;
3334
import org.apache.flink.metrics.groups.SinkWriterMetricGroup;
35+
import org.apache.flink.util.Preconditions;
3436

3537
import org.slf4j.Logger;
3638
import org.slf4j.LoggerFactory;
@@ -42,10 +44,13 @@
4244
import software.amazon.awssdk.services.kinesis.model.PutRecordsResultEntry;
4345
import software.amazon.awssdk.services.kinesis.model.ResourceNotFoundException;
4446

47+
import java.io.IOException;
4548
import java.util.ArrayList;
4649
import java.util.Collection;
4750
import java.util.Collections;
51+
import java.util.HashMap;
4852
import java.util.List;
53+
import java.util.Map;
4954
import java.util.Properties;
5055
import java.util.concurrent.CompletableFuture;
5156
import java.util.function.Consumer;
@@ -96,11 +101,8 @@ class KinesisStreamsSinkWriter<InputT> extends AsyncSinkWriter<InputT, PutRecord
96101
/* The sink writer metric group */
97102
private final SinkWriterMetricGroup metrics;
98103

99-
/* The asynchronous http client for the asynchronous Kinesis client */
100-
private final SdkAsyncHttpClient httpClient;
101-
102-
/* The asynchronous Kinesis client - construction is by kinesisClientProperties */
103-
private final KinesisAsyncClient kinesisClient;
104+
/* The client provider */
105+
private KinesisClientProvider kinesisClientProvider;
104106

105107
/* Flag to whether fatally fail any time we encounter an exception when persisting records */
106108
private final boolean failOnError;
@@ -148,6 +150,36 @@ class KinesisStreamsSinkWriter<InputT> extends AsyncSinkWriter<InputT, PutRecord
148150
String streamArn,
149151
Properties kinesisClientProperties,
150152
Collection<BufferedRequestState<PutRecordsRequestEntry>> states) {
153+
this(
154+
elementConverter,
155+
context,
156+
maxBatchSize,
157+
maxInFlightRequests,
158+
maxBufferedRequests,
159+
maxBatchSizeInBytes,
160+
maxTimeInBufferMS,
161+
maxRecordSizeInBytes,
162+
failOnError,
163+
streamName,
164+
streamArn,
165+
states,
166+
createDefaultClientProvider(kinesisClientProperties));
167+
}
168+
169+
KinesisStreamsSinkWriter(
170+
ElementConverter<InputT, PutRecordsRequestEntry> elementConverter,
171+
Sink.InitContext context,
172+
int maxBatchSize,
173+
int maxInFlightRequests,
174+
int maxBufferedRequests,
175+
long maxBatchSizeInBytes,
176+
long maxTimeInBufferMS,
177+
long maxRecordSizeInBytes,
178+
boolean failOnError,
179+
String streamName,
180+
String streamArn,
181+
Collection<BufferedRequestState<PutRecordsRequestEntry>> states,
182+
KinesisClientProvider kinesisClientProvider) {
151183
super(
152184
elementConverter,
153185
context,
@@ -167,11 +199,48 @@ class KinesisStreamsSinkWriter<InputT> extends AsyncSinkWriter<InputT, PutRecord
167199
this.streamArn = streamArn;
168200
this.metrics = context.metricGroup();
169201
this.numRecordsOutErrorsCounter = metrics.getNumRecordsOutErrorsCounter();
170-
this.httpClient = AWSGeneralUtil.createAsyncHttpClient(kinesisClientProperties);
171-
this.kinesisClient = buildClient(kinesisClientProperties, this.httpClient);
202+
setKinesisClientProvider(kinesisClientProvider);
203+
}
204+
205+
/**
206+
* Create a default KinesisClientProvider to manage the Kinesis client and HTTP client.
207+
*
208+
* @param kinesisClientProperties Properties for configuring the Kinesis client
209+
* @return A KinesisClientProvider implementation
210+
*/
211+
private static KinesisClientProvider createDefaultClientProvider(Properties kinesisClientProperties) {
212+
return new KinesisClientProvider() {
213+
private final SdkAsyncHttpClient httpClient =
214+
AWSGeneralUtil.createAsyncHttpClient(kinesisClientProperties);
215+
private final KinesisAsyncClient kinesisClient =
216+
buildClient(kinesisClientProperties, httpClient);
217+
218+
@Override
219+
public KinesisAsyncClient get() {
220+
return kinesisClient;
221+
}
222+
223+
@Override
224+
public void close() {
225+
AWSGeneralUtil.closeResources(httpClient, kinesisClient);
226+
}
227+
};
228+
}
229+
230+
/**
231+
* Set a custom KinesisAsyncClient provider for testing purposes. This method is only intended
232+
* to be used in tests.
233+
*
234+
* @param kinesisClientProvider The provider that supplies the KinesisAsyncClient
235+
*/
236+
@VisibleForTesting
237+
void setKinesisClientProvider(KinesisClientProvider kinesisClientProvider) {
238+
this.kinesisClientProvider =
239+
Preconditions.checkNotNull(
240+
kinesisClientProvider, "The kinesisClientProvider must not be null.");
172241
}
173242

174-
private KinesisAsyncClient buildClient(
243+
private static KinesisAsyncClient buildClient(
175244
Properties kinesisClientProperties, SdkAsyncHttpClient httpClient) {
176245
AWSGeneralUtil.validateAwsCredentials(kinesisClientProperties);
177246

@@ -208,6 +277,7 @@ protected void submitRequestEntries(
208277
.streamARN(streamArn)
209278
.build();
210279

280+
KinesisAsyncClient kinesisClient = kinesisClientProvider.get();
211281
CompletableFuture<PutRecordsResponse> future = kinesisClient.putRecords(batchRequest);
212282

213283
future.whenComplete(
@@ -244,34 +314,121 @@ private void handleFullyFailedRequest(
244314

245315
@Override
246316
public void close() {
247-
AWSGeneralUtil.closeResources(httpClient, kinesisClient);
317+
try {
318+
kinesisClientProvider.close();
319+
} catch (IOException e) {
320+
throw new RuntimeException("Failed to close the kinesisClientProvider", e);
321+
}
248322
}
249323

250324
private void handlePartiallyFailedRequest(
251325
PutRecordsResponse response,
252326
List<PutRecordsRequestEntry> requestEntries,
253327
Consumer<List<PutRecordsRequestEntry>> requestResult) {
254-
LOG.warn(
255-
"KDS Sink failed to write and will retry {} entries to KDS",
256-
response.failedRecordCount());
257-
numRecordsOutErrorsCounter.inc(response.failedRecordCount());
328+
int failedRecordCount = response.failedRecordCount();
329+
LOG.warn("KDS Sink failed to write and will retry {} entries to KDS", failedRecordCount);
330+
numRecordsOutErrorsCounter.inc(failedRecordCount);
258331

259332
if (failOnError) {
260333
getFatalExceptionCons()
261334
.accept(new KinesisStreamsException.KinesisStreamsFailFastException());
262335
return;
263336
}
264-
List<PutRecordsRequestEntry> failedRequestEntries =
265-
new ArrayList<>(response.failedRecordCount());
337+
338+
List<PutRecordsRequestEntry> failedRequestEntries = new ArrayList<>(failedRecordCount);
266339
List<PutRecordsResultEntry> records = response.records();
267340

341+
// Collect error information and build the list of failed entries
342+
Map<String, ErrorSummary> errorSummaries =
343+
collectErrorSummaries(records, requestEntries, failedRequestEntries);
344+
345+
// Log aggregated error information
346+
logErrorSummaries(errorSummaries);
347+
348+
requestResult.accept(failedRequestEntries);
349+
}
350+
351+
/**
352+
* Collect error summaries from failed records and build a list of failed request entries.
353+
*
354+
* @param records The result entries from the Kinesis response
355+
* @param requestEntries The original request entries
356+
* @param failedRequestEntries List to populate with failed entries (modified as a side effect)
357+
* @return A map of error codes to their summaries
358+
*/
359+
private Map<String, ErrorSummary> collectErrorSummaries(
360+
List<PutRecordsResultEntry> records,
361+
List<PutRecordsRequestEntry> requestEntries,
362+
List<PutRecordsRequestEntry> failedRequestEntries) {
363+
364+
// We capture error info while minimizing logging overhead in the data path,
365+
// which is critical for maintaining throughput performance
366+
Map<String, ErrorSummary> errorSummaries = new HashMap<>();
367+
268368
for (int i = 0; i < records.size(); i++) {
269-
if (records.get(i).errorCode() != null) {
369+
PutRecordsResultEntry resultEntry = records.get(i);
370+
String errorCode = resultEntry.errorCode();
371+
372+
if (errorCode != null) {
373+
// Track the frequency of each error code to identify patterns
374+
ErrorSummary summary =
375+
errorSummaries.computeIfAbsent(
376+
errorCode, code -> new ErrorSummary(resultEntry.errorMessage()));
377+
summary.incrementCount();
378+
270379
failedRequestEntries.add(requestEntries.get(i));
271380
}
272381
}
273382

274-
requestResult.accept(failedRequestEntries);
383+
return errorSummaries;
384+
}
385+
386+
/**
387+
* Log aggregated error information at WARN level.
388+
*
389+
* @param errorSummaries Map of error codes to their summaries
390+
*/
391+
private void logErrorSummaries(Map<String, ErrorSummary> errorSummaries) {
392+
// We log aggregated error information at WARN level to ensure visibility in production
393+
// while avoiding the performance impact of logging each individual failure
394+
if (!errorSummaries.isEmpty()) {
395+
StringBuilder errorSummary = new StringBuilder("Kinesis errors summary: ");
396+
errorSummaries.forEach(
397+
(code, summary) ->
398+
errorSummary.append(
399+
String.format(
400+
"[%s: %d records, example: %s] ",
401+
code,
402+
summary.getCount(),
403+
summary.getExampleMessage())));
404+
405+
// Using a single WARN log with aggregated information provides operational
406+
// visibility into errors without flooding logs in high-throughput scenarios
407+
LOG.warn("KDS Sink failed to write, " + errorSummary.toString());
408+
}
409+
}
410+
411+
/** Helper class to store error summary information. */
412+
private static class ErrorSummary {
413+
private final String exampleMessage;
414+
private int count;
415+
416+
ErrorSummary(String exampleMessage) {
417+
this.exampleMessage = exampleMessage;
418+
this.count = 0;
419+
}
420+
421+
void incrementCount() {
422+
count++;
423+
}
424+
425+
int getCount() {
426+
return count;
427+
}
428+
429+
String getExampleMessage() {
430+
return exampleMessage;
431+
}
275432
}
276433

277434
private boolean isRetryable(Throwable err) {

0 commit comments

Comments
 (0)