Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

JMX Scraper - YAML config and integration test for HBase #1538

Merged
merged 10 commits into from
Nov 25, 2024
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,6 @@ public class HBaseIntegrationTest extends TargetSystemIntegrationTest {
@Override
protected GenericContainer<?> createTargetContainer(int jmxPort) {
return new GenericContainer<>("dajobe/hbase")
.withEnv("JAVA_HOME", "/usr/lib/jvm/java-8-openjdk-amd64")
.withEnv("HBASE_OPTS", "-XX:+UseConcMarkSweepGC")
.withEnv("HBASE_MASTER_OPTS", genericJmxJvmArguments(jmxPort))
.withStartupTimeout(Duration.ofMinutes(2))
.withExposedPorts(jmxPort, DEFAULT_MASTER_SERVICE_PORT)
Expand Down
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[for reviewer] Some small improvements to report meaningful errors for few failed assertions

Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ static void assertGauge(Metric metric, String name, String description, String u
assertThat(metric.getName()).isEqualTo(name);
assertThat(metric.getDescription()).isEqualTo(description);
assertThat(metric.getUnit()).isEqualTo(unit);
assertThat(metric.hasGauge()).withFailMessage("Invalid metric type").isTrue();
assertMetricWithGauge(metric);
assertThat(metric.getGauge().getDataPointsList())
.satisfiesExactly(point -> assertThat(point.getAttributesList()).isEmpty());
}
Expand All @@ -41,10 +41,7 @@ static void assertSum(
assertThat(metric.getName()).isEqualTo(name);
assertThat(metric.getDescription()).isEqualTo(description);
assertThat(metric.getUnit()).isEqualTo(unit);
assertThat(metric.hasSum()).withFailMessage("Invalid metric type").isTrue();
assertThat(metric.getSum().getIsMonotonic())
.withFailMessage("Metric should " + (isMonotonic ? "" : "not ") + "be monotonic")
.isEqualTo(isMonotonic);
assertMetricWithSum(metric, isMonotonic);
assertThat(metric.getSum().getDataPointsList())
.satisfiesExactly(point -> assertThat(point.getAttributesList()).isEmpty());
}
Expand All @@ -54,7 +51,7 @@ static void assertTypedGauge(
assertThat(metric.getName()).isEqualTo(name);
assertThat(metric.getDescription()).isEqualTo(description);
assertThat(metric.getUnit()).isEqualTo(unit);
assertThat(metric.hasGauge()).withFailMessage("Invalid metric type").isTrue();
assertMetricWithGauge(metric);
assertTypedPoints(metric.getGauge().getDataPointsList(), types);
}

Expand All @@ -63,7 +60,7 @@ static void assertTypedSum(
assertThat(metric.getName()).isEqualTo(name);
assertThat(metric.getDescription()).isEqualTo(description);
assertThat(metric.getUnit()).isEqualTo(unit);
assertThat(metric.hasSum()).withFailMessage("Invalid metric type").isTrue();
assertMetricWithSum(metric);
assertTypedPoints(metric.getSum().getDataPointsList(), types);
}

Expand All @@ -89,10 +86,7 @@ static void assertSumWithAttributes(
assertThat(metric.getName()).isEqualTo(name);
assertThat(metric.getDescription()).isEqualTo(description);
assertThat(metric.getUnit()).isEqualTo(unit);
assertThat(metric.hasSum()).withFailMessage("Invalid metric type").isTrue();
assertThat(metric.getSum().getIsMonotonic())
.withFailMessage("Metric should " + (isMonotonic ? "" : "not ") + "be monotonic")
.isEqualTo(isMonotonic);
assertMetricWithSum(metric, isMonotonic);
assertAttributedPoints(metric.getSum().getDataPointsList(), attributeGroupAssertions);
}

Expand All @@ -107,8 +101,7 @@ static void assertSumWithAttributesMultiplePoints(
assertThat(metric.getName()).isEqualTo(name);
assertThat(metric.getDescription()).isEqualTo(description);
assertThat(metric.getUnit()).isEqualTo(unit);
assertThat(metric.hasSum()).isTrue();
assertThat(metric.getSum().getIsMonotonic()).isEqualTo(isMonotonic);
assertMetricWithSum(metric, isMonotonic);
assertAttributedMultiplePoints(metric.getSum().getDataPointsList(), attributeGroupAssertions);
}

Expand All @@ -122,10 +115,25 @@ static void assertGaugeWithAttributes(
assertThat(metric.getName()).isEqualTo(name);
assertThat(metric.getDescription()).isEqualTo(description);
assertThat(metric.getUnit()).isEqualTo(unit);
assertThat(metric.hasGauge()).withFailMessage("Invalid metric type").isTrue();
assertMetricWithGauge(metric);
assertAttributedPoints(metric.getGauge().getDataPointsList(), attributeGroupAssertions);
}

private static void assertMetricWithGauge(Metric metric) {
assertThat(metric.hasGauge()).withFailMessage("Metric with gauge expected").isTrue();
}

private static void assertMetricWithSum(Metric metric) {
assertThat(metric.hasSum()).withFailMessage("Metric with sum expected").isTrue();
}

private static void assertMetricWithSum(Metric metric, boolean isMonotonic) {
assertMetricWithSum(metric);
assertThat(metric.getSum().getIsMonotonic())
.withFailMessage("Metric should " + (isMonotonic ? "" : "not ") + "be monotonic")
.isEqualTo(isMonotonic);
}

@SuppressWarnings("unchecked")
private static void assertTypedPoints(List<NumberDataPoint> points, List<String> types) {
Consumer<MapAssert<String, String>>[] assertions =
Expand Down
45 changes: 32 additions & 13 deletions jmx-scraper/src/main/resources/hbase.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ rules:
unit: "{server}"
type: updowncounter
mapping:
# Group of properties to build hbase.master.region_server.count metric
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
# Group of properties to build hbase.master.region_server.count metric
# hbase.master.region_server.count

[nitpicking] Maybe simplify this even further as everything here is a metric (and do the same with others that match Group of properties to build .* metric regex)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

numDeadRegionServers:
metric: &metric region_server.count
desc: &desc The number of region servers.
Expand All @@ -26,9 +27,11 @@ rules:
ritCount:
metric: count
desc: The number of regions that are in transition.

ritCountOverThreshold:
metric: over_threshold
desc: The number of regions that have been in transition longer than a threshold time.

ritOldestAge:
metric: oldest_age
unit: ms
Expand Down Expand Up @@ -61,6 +64,19 @@ rules:
unit: "{log}"
desc: The number of write ahead logs not yet archived.

percentFilesLocal:
metric: files.local
type: gauge
unit: "%"
desc: Percent of store file data that can be read from the local.

updatesBlockedTime:
metric: blocked_update.time
type: gauge
unit: ms
desc: Amount of time updates have been blocked so the memstore can be flushed.

# Group of properties to build hbase.region_server.request.count metric
writeRequestCount:
metric: &metric request.count
unit: &unit "{request}"
Expand All @@ -76,6 +92,7 @@ rules:
state: const(read)
region_server: *hostname

# Group of properties to build hbase.region_server.queue.length metric
flushQueueLength:
metric: &metric queue.length
unit: &unit "{handler}"
Expand All @@ -91,12 +108,7 @@ rules:
state: const(compaction)
region_server: *hostname

updatesBlockedTime:
metric: blocked_update.time
type: gauge
unit: ms
desc: Amount of time updates have been blocked so the memstore can be flushed.

# Group of properties to build hbase.region_server.block_cache.operation.count metric
blockCacheMissCount:
metric: &metric block_cache.operation.count
type: &type gauge
Expand All @@ -114,19 +126,14 @@ rules:
state: const(hit)
region_server: *hostname

percentFilesLocal:
metric: files.local
type: gauge
unit: "%"
desc: Percent of store file data that can be read from the local.

# Group of properties to build hbase.region_server.operations.slow metric
slowDeleteCount:
metric: &metric operations.slow
unit: &unit "{operation}"
desc: &desc Number of operations that took over 1000ms to complete.
metricAttribute:
operation: const(delete)
region_server: &hostname beanattr(tag\.Hostname)
region_server: *hostname
slowAppendCount:
metric: *metric
unit: *unit
Expand Down Expand Up @@ -156,13 +163,15 @@ rules:
operation: const(increment)
region_server: *hostname

# RegionServer statistical metrics
- bean: Hadoop:service=HBase,name=RegionServer,sub=Server
prefix: hbase.region_server.
type: gauge
unit: ms
metricAttribute:
region_server: *hostname
mapping:
# Statistics for 'append' operation
Append_99th_percentile:
metric: operation.append.latency.p99
desc: Append operation 99th Percentile latency.
Expand All @@ -179,6 +188,7 @@ rules:
metric: operation.append.latency.median
desc: Append operation median latency.

# Statistics for 'delete' operation
Delete_99th_percentile:
metric: operation.delete.latency.p99
desc: Delete operation 99th Percentile latency.
Expand All @@ -195,6 +205,7 @@ rules:
metric: operation.delete.latency.median
desc: Delete operation median latency.

# Statistics for 'put' operation
Put_99th_percentile:
metric: operation.put.latency.p99
desc: Put operation 99th Percentile latency.
Expand All @@ -211,6 +222,7 @@ rules:
metric: operation.put.latency.median
desc: Put operation median latency.

# Statistics for 'get' operation
Get_99th_percentile:
metric: operation.get.latency.p99
desc: Get operation 99th Percentile latency.
Expand All @@ -227,6 +239,7 @@ rules:
metric: operation.get.latency.median
desc: Get operation median latency.

# Statistics for 'replay' operation
Replay_99th_percentile:
metric: operation.replay.latency.p99
desc: Replay operation 99th Percentile latency.
Expand All @@ -243,6 +256,7 @@ rules:
metric: operation.replay.latency.median
desc: Replay operation median latency.

# Statistics for 'increment' operation
Increment_99th_percentile:
metric: operation.increment.latency.p99
desc: Increment operation 99th Percentile latency.
Expand All @@ -269,11 +283,13 @@ rules:
metric: open_connection.count
unit: "{connection}"
desc: The number of open connections at the RPC layer.

numActiveHandler:
metric: active_handler.count
unit: "{handler}"
desc: The number of RPC handlers actively servicing requests.

# Group of properties to build hbase.region_server.queue.request.count metric
numCallsInReplicationQueue:
metric: &metric queue.request.count
unit: &unit "{request}"
Expand All @@ -296,6 +312,7 @@ rules:
state: const(priority)
region_server: *hostname

# Group of properties to build hbase.region_server.authentication.count metric
authenticationSuccesses:
metric: &metric authentication.count
unit: &unit "{authentication request}"
Expand All @@ -321,9 +338,11 @@ rules:
GcTimeMillis:
metric: time
desc: Time spent in garbage collection.

GcTimeMillisParNew:
metric: young_gen.time
desc: Time spent in garbage collection of the young generation.

GcTimeMillisConcurrentMarkSweep:
metric: old_gen.time
desc: Time spent in garbage collection of the old generation.
Loading