Skip to content

Commit c2d4d62

Browse files
[Improve] state/elasticache-redis - Improve alerting (widdix#463)
1 parent ee192fd commit c2d4d62

File tree

1 file changed

+132
-24
lines changed

1 file changed

+132
-24
lines changed

Diff for: state/elasticache-redis.yaml

+132-24
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,14 @@ Metadata:
3838
- SubDomainName
3939
- NumShards
4040
- NumReplicas
41+
- Label:
42+
default: 'Alerting Parameters'
43+
Parameters:
44+
- CPUUtilizationThreshold
45+
- DatabaseMemoryUsagePercentageThreshold
46+
- SwapUsageThreshold
47+
- EvictionsThreshold
48+
- ReplicationLagThreshold
4149
Parameters:
4250
ParentVPCStack:
4351
Description: 'Stack name of parent VPC stack based on vpc/vpc-*azs.yaml template.'
@@ -112,6 +120,33 @@ Parameters:
112120
Default: 1
113121
MinValue: 0
114122
MaxValue: 5
123+
CPUUtilizationThreshold:
124+
Description: 'The maximum percentage of CPU usage (set to -1 to disable).'
125+
Type: Number
126+
Default: 80
127+
MinValue: -1
128+
MaxValue: 100
129+
DatabaseMemoryUsagePercentageThreshold:
130+
Description: 'The maximum percentage of memory usage (set to -1 to disable).'
131+
Type: Number
132+
Default: 90
133+
MinValue: -1
134+
MaxValue: 100
135+
SwapUsageThreshold:
136+
Description: 'The maximum bytes of swap usage (set to -1 to disable).'
137+
Type: Number
138+
Default: 67108864 # 64 MB in Bytes
139+
MinValue: -1
140+
EvictionsThreshold:
141+
Description: 'The maximum number of evictions (set to -1 to disable).'
142+
Type: Number
143+
Default: 1000
144+
MinValue: -1
145+
ReplicationLagThreshold:
146+
Description: 'The maximum seconds of replication lag (set to -1 to disable).'
147+
Type: Number
148+
Default: 30
149+
MinValue: -1
115150
Mappings:
116151
EngineVersionMap:
117152
'3.2.6':
@@ -130,6 +165,11 @@ Conditions:
130165
HasAuthToken: !Not [!Equals [!Ref AuthToken, '']]
131166
HasSnapshotName: !Not [!Equals [!Ref SnapshotName, '']]
132167
HasAutomaticFailoverEnabled: !Not [!Equals [!Ref NumReplicas, 0]]
168+
HasCPUUtilizationThresholdAndAlertTopic: !And [!Not [!Equals [!Ref CPUUtilizationThreshold, '-1']], !Condition HasAlertTopic] # -1 is the documented "disabled" sentinel
169+
HasDatabaseMemoryUsagePercentageThresholdAndAlertTopic: !And [!Not [!Equals [!Ref DatabaseMemoryUsagePercentageThreshold, '-1']], !Condition HasAlertTopic] # -1 is the documented "disabled" sentinel
170+
HasSwapUsageThresholdAndAlertTopic: !And [!Not [!Equals [!Ref SwapUsageThreshold, '-1']], !Condition HasAlertTopic] # -1 is the documented "disabled" sentinel
171+
HasEvictionsThresholdAndAlertTopic: !And [!Not [!Equals [!Ref EvictionsThreshold, '-1']], !Condition HasAlertTopic] # -1 is the documented "disabled" sentinel
172+
HasReplicationLagThresholdAndAlertTopic: !And [!Not [!Equals [!Ref ReplicationLagThreshold, '-1']], !Condition HasAlertTopic] # -1 is the documented "disabled" sentinel
133173
Resources:
134174
RecordSet:
135175
Condition: HasZone
@@ -209,136 +249,204 @@ Resources:
209249
UpdatePolicy:
210250
UseOnlineResharding: true
211251
Node1CPUUtilizationTooHighAlarm:
212-
Condition: HasAlertTopic
252+
Condition: HasCPUUtilizationThresholdAndAlertTopic
213253
Type: 'AWS::CloudWatch::Alarm'
214254
Properties:
215-
AlarmDescription: 'Average CPU utilization over last 10 minutes higher than 80%'
255+
AlarmDescription: !Sub 'Average CPU utilization over last 10 minutes higher than ${CPUUtilizationThreshold}%'
216256
Namespace: 'AWS/ElastiCache'
217257
MetricName: CPUUtilization
218258
Statistic: Average
219259
Period: 600
220260
EvaluationPeriods: 1
221261
ComparisonOperator: GreaterThanThreshold
222-
Threshold: 80
262+
Threshold: !Ref CPUUtilizationThreshold
223263
AlarmActions:
224264
- 'Fn::ImportValue': !Sub '${ParentAlertStack}-TopicARN'
225265
Dimensions:
226266
- Name: CacheClusterId
227267
Value: !Sub '${ReplicationGroup}-001'
228268
Node2CPUUtilizationTooHighAlarm:
229-
Condition: HasAlertTopic
269+
Condition: HasCPUUtilizationThresholdAndAlertTopic
230270
Type: 'AWS::CloudWatch::Alarm'
231271
Properties:
232-
AlarmDescription: 'Average CPU utilization over last 10 minutes higher than 80%'
272+
AlarmDescription: !Sub 'Average CPU utilization over last 10 minutes higher than ${CPUUtilizationThreshold}%'
233273
Namespace: 'AWS/ElastiCache'
234274
MetricName: CPUUtilization
235275
Statistic: Average
236276
Period: 600
237277
EvaluationPeriods: 1
238278
ComparisonOperator: GreaterThanThreshold
239-
Threshold: 80
279+
Threshold: !Ref CPUUtilizationThreshold
280+
AlarmActions:
281+
- 'Fn::ImportValue': !Sub '${ParentAlertStack}-TopicARN'
282+
Dimensions:
283+
- Name: CacheClusterId
284+
Value: !Sub '${ReplicationGroup}-002'
285+
Node1EngineCPUUtilizationTooHighAlarm:
286+
Condition: HasCPUUtilizationThresholdAndAlertTopic
287+
Type: 'AWS::CloudWatch::Alarm'
288+
Properties:
289+
AlarmDescription: !Sub 'Average engine CPU utilization over last 10 minutes higher than ${CPUUtilizationThreshold}%'
290+
Namespace: 'AWS/ElastiCache'
291+
MetricName: EngineCPUUtilization
292+
Statistic: Average
293+
Period: 600
294+
EvaluationPeriods: 1
295+
ComparisonOperator: GreaterThanThreshold
296+
Threshold: !Ref CPUUtilizationThreshold
297+
AlarmActions:
298+
- 'Fn::ImportValue': !Sub '${ParentAlertStack}-TopicARN'
299+
Dimensions:
300+
- Name: CacheClusterId
301+
Value: !Sub '${ReplicationGroup}-001'
302+
Node2EngineCPUUtilizationTooHighAlarm:
303+
Condition: HasCPUUtilizationThresholdAndAlertTopic
304+
Type: 'AWS::CloudWatch::Alarm'
305+
Properties:
306+
AlarmDescription: !Sub 'Average engine CPU utilization over last 10 minutes higher than ${CPUUtilizationThreshold}%'
307+
Namespace: 'AWS/ElastiCache'
308+
MetricName: EngineCPUUtilization
309+
Statistic: Average
310+
Period: 600
311+
EvaluationPeriods: 1
312+
ComparisonOperator: GreaterThanThreshold
313+
Threshold: !Ref CPUUtilizationThreshold
314+
AlarmActions:
315+
- 'Fn::ImportValue': !Sub '${ParentAlertStack}-TopicARN'
316+
Dimensions:
317+
- Name: CacheClusterId
318+
Value: !Sub '${ReplicationGroup}-002'
319+
Node1DatabaseMemoryUsagePercentageTooHighAlarm:
320+
Condition: HasDatabaseMemoryUsagePercentageThresholdAndAlertTopic
321+
Type: 'AWS::CloudWatch::Alarm'
322+
Properties:
323+
AlarmDescription: !Sub 'Average memory usage over last 10 minutes higher than ${DatabaseMemoryUsagePercentageThreshold}%, performance may suffer'
324+
Namespace: 'AWS/ElastiCache'
325+
MetricName: DatabaseMemoryUsagePercentage
326+
Statistic: Average
327+
Period: 600
328+
EvaluationPeriods: 1
329+
ComparisonOperator: GreaterThanThreshold
330+
Threshold: !Ref DatabaseMemoryUsagePercentageThreshold
331+
AlarmActions:
332+
- 'Fn::ImportValue': !Sub '${ParentAlertStack}-TopicARN'
333+
Dimensions:
334+
- Name: CacheClusterId
335+
Value: !Sub '${ReplicationGroup}-001'
336+
Node2DatabaseMemoryUsagePercentageTooHighAlarm:
337+
Condition: HasDatabaseMemoryUsagePercentageThresholdAndAlertTopic
338+
Type: 'AWS::CloudWatch::Alarm'
339+
Properties:
340+
AlarmDescription: !Sub 'Average memory usage over last 10 minutes higher than ${DatabaseMemoryUsagePercentageThreshold}%, performance may suffer'
341+
Namespace: 'AWS/ElastiCache'
342+
MetricName: DatabaseMemoryUsagePercentage
343+
Statistic: Average
344+
Period: 600
345+
EvaluationPeriods: 1
346+
ComparisonOperator: GreaterThanThreshold
347+
Threshold: !Ref DatabaseMemoryUsagePercentageThreshold
240348
AlarmActions:
241349
- 'Fn::ImportValue': !Sub '${ParentAlertStack}-TopicARN'
242350
Dimensions:
243351
- Name: CacheClusterId
244352
Value: !Sub '${ReplicationGroup}-002'
245353
Node1SwapUsageTooHighAlarm:
246-
Condition: HasAlertTopic
354+
Condition: HasSwapUsageThresholdAndAlertTopic
247355
Type: 'AWS::CloudWatch::Alarm'
248356
Properties:
249-
AlarmDescription: 'Average swap usage over last 10 minutes higher than 64 MB, performance may suffer'
357+
AlarmDescription: !Sub 'Average swap usage over last 10 minutes higher than ${SwapUsageThreshold} bytes, performance may suffer'
250358
Namespace: 'AWS/ElastiCache'
251359
MetricName: SwapUsage
252360
Statistic: Average
253361
Period: 600
254362
EvaluationPeriods: 1
255363
ComparisonOperator: GreaterThanThreshold
256-
Threshold: 67108864 # 64 MB in Bytes
364+
Threshold: !Ref SwapUsageThreshold
257365
AlarmActions:
258366
- 'Fn::ImportValue': !Sub '${ParentAlertStack}-TopicARN'
259367
Dimensions:
260368
- Name: CacheClusterId
261369
Value: !Sub '${ReplicationGroup}-001'
262370
Node2SwapUsageTooHighAlarm:
263-
Condition: HasAlertTopic
371+
Condition: HasSwapUsageThresholdAndAlertTopic
264372
Type: 'AWS::CloudWatch::Alarm'
265373
Properties:
266-
AlarmDescription: 'Average swap usage over last 10 minutes higher than 64 MB, performance may suffer'
374+
AlarmDescription: !Sub 'Average swap usage over last 10 minutes higher than ${SwapUsageThreshold} bytes, performance may suffer'
267375
Namespace: 'AWS/ElastiCache'
268376
MetricName: SwapUsage
269377
Statistic: Average
270378
Period: 600
271379
EvaluationPeriods: 1
272380
ComparisonOperator: GreaterThanThreshold
273-
Threshold: 67108864 # 64 MB in Bytes
381+
Threshold: !Ref SwapUsageThreshold
274382
AlarmActions:
275383
- 'Fn::ImportValue': !Sub '${ParentAlertStack}-TopicARN'
276384
Dimensions:
277385
- Name: CacheClusterId
278386
Value: !Sub '${ReplicationGroup}-002'
279387
Node1EvictionsTooHighAlarm:
280-
Condition: HasAlertTopic
388+
Condition: HasEvictionsThresholdAndAlertTopic
281389
Type: 'AWS::CloudWatch::Alarm'
282390
Properties:
283-
AlarmDescription: 'Average evictions over last 10 minutes higher than 1000, may enough memory for all keys'
391+
AlarmDescription: !Sub 'Average evictions over last 10 minutes higher than ${EvictionsThreshold}, cache hit ratio may suffer'
284392
Namespace: 'AWS/ElastiCache'
285393
MetricName: Evictions
286394
Statistic: Average
287395
Period: 600
288396
EvaluationPeriods: 1
289397
ComparisonOperator: GreaterThanThreshold
290-
Threshold: 1000
398+
Threshold: !Ref EvictionsThreshold
291399
AlarmActions:
292400
- 'Fn::ImportValue': !Sub '${ParentAlertStack}-TopicARN'
293401
Dimensions:
294402
- Name: CacheClusterId
295403
Value: !Sub '${ReplicationGroup}-001'
296404
Node2EvictionsTooHighAlarm:
297-
Condition: HasAlertTopic
405+
Condition: HasEvictionsThresholdAndAlertTopic
298406
Type: 'AWS::CloudWatch::Alarm'
299407
Properties:
300-
AlarmDescription: 'Average evictions over last 10 minutes higher than 1000, may enough memory for all keys'
408+
AlarmDescription: !Sub 'Average evictions over last 10 minutes higher than ${EvictionsThreshold}, cache hit ratio may suffer'
301409
Namespace: 'AWS/ElastiCache'
302410
MetricName: Evictions
303411
Statistic: Average
304412
Period: 600
305413
EvaluationPeriods: 1
306414
ComparisonOperator: GreaterThanThreshold
307-
Threshold: 1000
415+
Threshold: !Ref EvictionsThreshold
308416
AlarmActions:
309417
- 'Fn::ImportValue': !Sub '${ParentAlertStack}-TopicARN'
310418
Dimensions:
311419
- Name: CacheClusterId
312420
Value: !Sub '${ReplicationGroup}-002'
313421
Node1ReplicationLagTooHighAlarm:
314-
Condition: HasAlertTopic
422+
Condition: HasReplicationLagThresholdAndAlertTopic
315423
Type: 'AWS::CloudWatch::Alarm'
316424
Properties:
317-
AlarmDescription: 'Average replication lag over last 10 minutes higher than 30 seconds'
425+
AlarmDescription: !Sub 'Average replication lag over last 10 minutes higher than ${ReplicationLagThreshold} seconds'
318426
Namespace: 'AWS/ElastiCache'
319427
MetricName: ReplicationLag
320428
Statistic: Average
321429
Period: 600
322430
EvaluationPeriods: 1
323431
ComparisonOperator: GreaterThanThreshold
324-
Threshold: 30
432+
Threshold: !Ref ReplicationLagThreshold
325433
AlarmActions:
326434
- 'Fn::ImportValue': !Sub '${ParentAlertStack}-TopicARN'
327435
Dimensions:
328436
- Name: CacheClusterId
329437
Value: !Sub '${ReplicationGroup}-001'
330438
Node2ReplicationLagTooHighAlarm:
331-
Condition: HasAlertTopic
439+
Condition: HasReplicationLagThresholdAndAlertTopic
332440
Type: 'AWS::CloudWatch::Alarm'
333441
Properties:
334-
AlarmDescription: 'Average replication lag over last 10 minutes higher than 30 seconds'
442+
AlarmDescription: !Sub 'Average replication lag over last 10 minutes higher than ${ReplicationLagThreshold} seconds'
335443
Namespace: 'AWS/ElastiCache'
336444
MetricName: ReplicationLag
337445
Statistic: Average
338446
Period: 600
339447
EvaluationPeriods: 1
340448
ComparisonOperator: GreaterThanThreshold
341-
Threshold: 30
449+
Threshold: !Ref ReplicationLagThreshold
342450
AlarmActions:
343451
- 'Fn::ImportValue': !Sub '${ParentAlertStack}-TopicARN'
344452
Dimensions:

0 commit comments

Comments
 (0)