Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

JMX Scraper: YAML file and integration test hadoop #1675

Merged
merged 18 commits into from
Feb 3, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -46,63 +46,63 @@ void endToEnd() {
metric,
"hadoop.name_node.capacity.usage",
"The current used capacity across all data nodes reporting to the name node.",
"by",
"By",
attrs -> attrs.contains(entry("node_name", "test-host"))),
metric ->
assertSumWithAttributes(
metric,
"hadoop.name_node.capacity.limit",
"The total capacity allotted to data nodes reporting to the name node.",
"by",
"By",
attrs -> attrs.containsOnly(entry("node_name", "test-host"))),
metric ->
assertSumWithAttributes(
metric,
"hadoop.name_node.block.count",
"The total number of blocks on the name node.",
"{blocks}",
"{block}",
attrs -> attrs.containsOnly(entry("node_name", "test-host"))),
metric ->
assertSumWithAttributes(
metric,
"hadoop.name_node.block.missing",
"The number of blocks reported as missing to the name node.",
"{blocks}",
"{block}",
attrs -> attrs.containsOnly(entry("node_name", "test-host"))),
metric ->
assertSumWithAttributes(
metric,
"hadoop.name_node.block.corrupt",
"The number of blocks reported as corrupt to the name node.",
"{blocks}",
"{block}",
attrs -> attrs.containsOnly(entry("node_name", "test-host"))),
metric ->
assertSumWithAttributes(
metric,
"hadoop.name_node.volume.failed",
"The number of failed volumes reported to the name node.",
"{volumes}",
"{volume}",
attrs -> attrs.containsOnly(entry("node_name", "test-host"))),
metric ->
assertSumWithAttributes(
metric,
"hadoop.name_node.file.count",
"The total number of files being tracked by the name node.",
"{files}",
"{file}",
attrs -> attrs.containsOnly(entry("node_name", "test-host"))),
metric ->
assertSumWithAttributes(
metric,
"hadoop.name_node.file.load",
"The current number of concurrent file accesses.",
"{operations}",
"{operation}",
attrs -> attrs.containsOnly(entry("node_name", "test-host"))),
metric ->
assertSumWithAttributes(
metric,
"hadoop.name_node.data_node.count",
"The number of data nodes reporting to the name node.",
"{nodes}",
"{node}",
attrs ->
attrs.containsOnly(entry("node_name", "test-host"), entry("state", "live")),
attrs ->
Expand Down
18 changes: 9 additions & 9 deletions jmx-metrics/src/main/resources/target-systems/hadoop.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -15,31 +15,31 @@
*/

def beanHadoopNameNodeFS = otel.mbean("Hadoop:service=NameNode,name=FSNamesystem")
otel.instrument(beanHadoopNameNodeFS, "hadoop.name_node.capacity.usage", "The current used capacity across all data nodes reporting to the name node.", "by",
otel.instrument(beanHadoopNameNodeFS, "hadoop.name_node.capacity.usage", "The current used capacity across all data nodes reporting to the name node.", "By",
["node_name" : { mbean -> mbean.getProperty("tag.Hostname") }],
"CapacityUsed", otel.&longUpDownCounterCallback)
otel.instrument(beanHadoopNameNodeFS, "hadoop.name_node.capacity.limit", "The total capacity allotted to data nodes reporting to the name node.", "by",
otel.instrument(beanHadoopNameNodeFS, "hadoop.name_node.capacity.limit", "The total capacity allotted to data nodes reporting to the name node.", "By",
["node_name" : { mbean -> mbean.getProperty("tag.Hostname") }],
"CapacityTotal", otel.&longUpDownCounterCallback)
otel.instrument(beanHadoopNameNodeFS, "hadoop.name_node.block.count", "The total number of blocks on the name node.", "{blocks}",
otel.instrument(beanHadoopNameNodeFS, "hadoop.name_node.block.count", "The total number of blocks on the name node.", "{block}",
["node_name" : { mbean -> mbean.getProperty("tag.Hostname") }],
"BlocksTotal", otel.&longUpDownCounterCallback)
otel.instrument(beanHadoopNameNodeFS, "hadoop.name_node.block.missing", "The number of blocks reported as missing to the name node.", "{blocks}",
otel.instrument(beanHadoopNameNodeFS, "hadoop.name_node.block.missing", "The number of blocks reported as missing to the name node.", "{block}",
["node_name" : { mbean -> mbean.getProperty("tag.Hostname") }],
"MissingBlocks", otel.&longUpDownCounterCallback)
otel.instrument(beanHadoopNameNodeFS, "hadoop.name_node.block.corrupt", "The number of blocks reported as corrupt to the name node.", "{blocks}",
otel.instrument(beanHadoopNameNodeFS, "hadoop.name_node.block.corrupt", "The number of blocks reported as corrupt to the name node.", "{block}",
["node_name" : { mbean -> mbean.getProperty("tag.Hostname") }],
"CorruptBlocks", otel.&longUpDownCounterCallback)
otel.instrument(beanHadoopNameNodeFS, "hadoop.name_node.volume.failed", "The number of failed volumes reported to the name node.", "{volumes}",
otel.instrument(beanHadoopNameNodeFS, "hadoop.name_node.volume.failed", "The number of failed volumes reported to the name node.", "{volume}",
["node_name" : { mbean -> mbean.getProperty("tag.Hostname") }],
"VolumeFailuresTotal", otel.&longUpDownCounterCallback)
otel.instrument(beanHadoopNameNodeFS, "hadoop.name_node.file.count", "The total number of files being tracked by the name node.", "{files}",
otel.instrument(beanHadoopNameNodeFS, "hadoop.name_node.file.count", "The total number of files being tracked by the name node.", "{file}",
["node_name" : { mbean -> mbean.getProperty("tag.Hostname") }],
"FilesTotal", otel.&longUpDownCounterCallback)
otel.instrument(beanHadoopNameNodeFS, "hadoop.name_node.file.load", "The current number of concurrent file accesses.", "{operations}",
otel.instrument(beanHadoopNameNodeFS, "hadoop.name_node.file.load", "The current number of concurrent file accesses.", "{operation}",
["node_name" : { mbean -> mbean.getProperty("tag.Hostname") }],
"TotalLoad", otel.&longUpDownCounterCallback)
otel.instrument(beanHadoopNameNodeFS, "hadoop.name_node.data_node.count", "The number of data nodes reporting to the name node.", "{nodes}",
otel.instrument(beanHadoopNameNodeFS, "hadoop.name_node.data_node.count", "The number of data nodes reporting to the name node.", "{node}",
["node_name" : { mbean -> mbean.getProperty("tag.Hostname") }],
["NumLiveDataNodes":["state":{"live"}], "NumDeadDataNodes": ["state":{"dead"}]],
otel.&longUpDownCounterCallback)
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
/*
* Copyright The OpenTelemetry Authors
* SPDX-License-Identifier: Apache-2.0
*/

package io.opentelemetry.contrib.jmxscraper.target_systems;

import static io.opentelemetry.contrib.jmxscraper.assertions.DataPointAttributes.attribute;
import static io.opentelemetry.contrib.jmxscraper.assertions.DataPointAttributes.attributeGroup;

import io.opentelemetry.contrib.jmxscraper.JmxScraperContainer;
import io.opentelemetry.contrib.jmxscraper.assertions.AttributeMatcher;
import java.nio.file.Path;
import java.time.Duration;
import org.testcontainers.containers.GenericContainer;
import org.testcontainers.containers.wait.strategy.Wait;
import org.testcontainers.utility.MountableFile;

/**
 * Integration test for the "hadoop" target system: starts a Hadoop NameNode container with JMX
 * enabled (via the bundled hadoop-env.sh), runs the JMX scraper against it, and verifies the
 * emitted NameNode metrics (names, descriptions, units, and attributes).
 */
public class HadoopIntegrationTest extends TargetSystemIntegrationTest {

  private static final int HADOOP_PORT = 50070;

  @Override
  protected GenericContainer<?> createTargetContainer(int jmxPort) {
    // hadoop-env.sh copied into the container enables an unauthenticated JMX endpoint
    // on the NameNode process (see the integrationTest resources).
    return new GenericContainer<>("bmedora/hadoop:2.9-base")
        .withCopyFileToContainer(
            MountableFile.forClasspathResource("hadoop-env.sh", 0400),
            "/hadoop/etc/hadoop/hadoop-env.sh")
        .withExposedPorts(HADOOP_PORT, jmxPort)
        // Fixed host name makes the node_name metric attribute deterministic for assertions.
        .withCreateContainerCmdModifier(cmd -> cmd.withHostName("test-host"))
        // Single wait strategy: the original chained two waitingFor(...) calls, and the second
        // call silently replaced the first one, discarding its 2-minute startup timeout. Hadoop
        // can be slow to start, so keep the extended timeout on the strategy that applies.
        .waitingFor(
            Wait.forListeningPorts(HADOOP_PORT, jmxPort)
                .withStartupTimeout(Duration.ofMinutes(2)));
  }

  @Override
  protected JmxScraperContainer customizeScraperContainer(
      JmxScraperContainer scraper, GenericContainer<?> target, Path tempDir) {
    // Uses the bundled hadoop.yaml rules; no extra custom config needed.
    return scraper.withTargetSystem("hadoop");
  }

  @Override
  protected MetricsVerifier createMetricsVerifier() {
    // Every NameNode metric carries the node_name attribute (from the MBean tag.Hostname).
    AttributeMatcher nodeNameAttribute = attribute("node_name", "test-host");
    return MetricsVerifier.create()
        .add(
            "hadoop.name_node.capacity.usage",
            metric ->
                metric
                    .hasDescription(
                        "The current used capacity across all data nodes reporting to the name node.")
                    .hasUnit("By")
                    .isUpDownCounter()
                    .hasDataPointsWithOneAttribute(nodeNameAttribute))
        .add(
            "hadoop.name_node.capacity.limit",
            metric ->
                metric
                    .hasDescription(
                        "The total capacity allotted to data nodes reporting to the name node.")
                    .hasUnit("By")
                    .isUpDownCounter()
                    .hasDataPointsWithOneAttribute(nodeNameAttribute))
        .add(
            "hadoop.name_node.block.count",
            metric ->
                metric
                    .hasDescription("The total number of blocks on the name node.")
                    .hasUnit("{block}")
                    .isUpDownCounter()
                    .hasDataPointsWithOneAttribute(nodeNameAttribute))
        .add(
            "hadoop.name_node.block.missing",
            metric ->
                metric
                    .hasDescription("The number of blocks reported as missing to the name node.")
                    .hasUnit("{block}")
                    .isUpDownCounter()
                    .hasDataPointsWithOneAttribute(nodeNameAttribute))
        .add(
            "hadoop.name_node.block.corrupt",
            metric ->
                metric
                    .hasDescription("The number of blocks reported as corrupt to the name node.")
                    .hasUnit("{block}")
                    .isUpDownCounter()
                    .hasDataPointsWithOneAttribute(nodeNameAttribute))
        .add(
            "hadoop.name_node.volume.failed",
            metric ->
                metric
                    .hasDescription("The number of failed volumes reported to the name node.")
                    .hasUnit("{volume}")
                    .isUpDownCounter()
                    .hasDataPointsWithOneAttribute(nodeNameAttribute))
        .add(
            "hadoop.name_node.file.count",
            metric ->
                metric
                    .hasDescription("The total number of files being tracked by the name node.")
                    .hasUnit("{file}")
                    .isUpDownCounter()
                    .hasDataPointsWithOneAttribute(nodeNameAttribute))
        .add(
            "hadoop.name_node.file.load",
            metric ->
                metric
                    .hasDescription("The current number of concurrent file accesses.")
                    .hasUnit("{operation}")
                    .isUpDownCounter()
                    .hasDataPointsWithOneAttribute(nodeNameAttribute))
        .add(
            "hadoop.name_node.data_node.count",
            metric ->
                metric
                    .hasDescription("The number of data nodes reporting to the name node.")
                    .hasUnit("{node}")
                    .isUpDownCounter()
                    // Data-node count is split by the state attribute (live / dead).
                    .hasDataPointsWithAttributes(
                        attributeGroup(nodeNameAttribute, attribute("state", "live")),
                        attributeGroup(nodeNameAttribute, attribute("state", "dead"))));
  }
}
99 changes: 99 additions & 0 deletions jmx-scraper/src/integrationTest/resources/hadoop-env.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
#!/bin/bash

# Set Hadoop-specific environment variables here.

# The only required environment variable is JAVA_HOME. All others are
# optional. When running a distributed configuration it is best to
# set JAVA_HOME in this file, so that it is correctly defined on
# remote nodes.

# The java implementation to use.
export JAVA_HOME=${JAVA_HOME}

# The jsvc implementation to use. Jsvc is required to run secure datanodes
# that bind to privileged ports to provide authentication of data transfer
# protocol. Jsvc is not required if SASL is configured for authentication of
# data transfer protocol using non-privileged ports.
#export JSVC_HOME=${JSVC_HOME}

export HADOOP_CONF_DIR=${HADOOP_CONF_DIR:-"/etc/hadoop"}

# Extra Java CLASSPATH elements. Automatically insert capacity-scheduler.
for f in "$HADOOP_HOME"/contrib/capacity-scheduler/*.jar; do
if [ "$HADOOP_CLASSPATH" ]; then
export HADOOP_CLASSPATH=$HADOOP_CLASSPATH:$f
else
export HADOOP_CLASSPATH=$f
fi
done

# The maximum amount of heap to use, in MB. Default is 1000.
#export HADOOP_HEAPSIZE=
#export HADOOP_NAMENODE_INIT_HEAPSIZE=""

# Enable extra debugging of Hadoop's JAAS binding, used to set up
# Kerberos security.
# export HADOOP_JAAS_DEBUG=true

# Extra Java runtime options. Empty by default.
# For Kerberos debugging, an extended option set logs more information
# export HADOOP_OPTS="-Djava.net.preferIPv4Stack=true -Dsun.security.krb5.debug=true -Dsun.security.spnego.debug"
export HADOOP_OPTS="$HADOOP_OPTS -Djava.net.preferIPv4Stack=true"

# Command specific options appended to HADOOP_OPTS when specified
export HADOOP_NAMENODE_OPTS="-Dhadoop.security.logger=${HADOOP_SECURITY_LOGGER:-INFO,RFAS} -Dhdfs.audit.logger=${HDFS_AUDIT_LOGGER:-INFO,NullAppender} $HADOOP_NAMENODE_OPTS"
# Test-specific additions: expose an unauthenticated, non-SSL JMX endpoint on the
# NameNode at port 9999 so the JMX scraper integration test can connect to it.
export HADOOP_NAMENODE_OPTS="-Dcom.sun.management.jmxremote $HADOOP_NAMENODE_OPTS"
export HADOOP_NAMENODE_OPTS="$HADOOP_NAMENODE_OPTS -Dcom.sun.management.jmxremote.authenticate=false"
export HADOOP_NAMENODE_OPTS="$HADOOP_NAMENODE_OPTS -Dcom.sun.management.jmxremote.ssl=false"
export HADOOP_NAMENODE_OPTS="$HADOOP_NAMENODE_OPTS -Dcom.sun.management.jmxremote.port=9999 -Dcom.sun.management.jmxremote.rmi.port=9999"
Comment on lines +45 to +48
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[minor] would it be possible to avoid having this script file by just setting the HADOOP_NAMENODE_OPTS with appropriate JMX settings directly ?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What about all those other env vars too tho?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As discussed with @robsunday yesterday, this looks to be a modified copy of a shell script of the hadoop distribution, which might have been modified to configure the JMX configuration.

The current startup script might allow to easily provide this configuration, but quite often it's not possible to easily override through a single env variable, and sed surgery isn't the easiest thing to maintain.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So I tried few ideas how to get rid of this custom script, but without a luck. All tips/guides I've found on the internet recommend modification of hadoop-env.sh (as it is done in our test).
At first glance it looks like passing your own env var HADOOP_NAMENODE_OPTS to docker should work.
Unfortunately, Hadoop launches multiple processes during startup and executes this file many times to set up env variables, and the issue is that the initial content of HADOOP_NAMENODE_OPTS passed to docker is somehow lost.
Only hardcoding the JMX settings in hadoop-env.sh worked for me, but if you know of a reasonable alternative solution I'll be happy to use it.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for trying to remove it, we should be fine to leave it as-is since we don't have a better option.


# NOTE(review): the remaining lines appear to be unmodified defaults from the stock
# Hadoop 2.x hadoop-env.sh template — confirm against the upstream distribution.
export HADOOP_DATANODE_OPTS="-Dhadoop.security.logger=ERROR,RFAS $HADOOP_DATANODE_OPTS"

export HADOOP_SECONDARYNAMENODE_OPTS="-Dhadoop.security.logger=${HADOOP_SECURITY_LOGGER:-INFO,RFAS} -Dhdfs.audit.logger=${HDFS_AUDIT_LOGGER:-INFO,NullAppender} $HADOOP_SECONDARYNAMENODE_OPTS"

export HADOOP_NFS3_OPTS="$HADOOP_NFS3_OPTS"
export HADOOP_PORTMAP_OPTS="-Xmx512m $HADOOP_PORTMAP_OPTS"

# The following applies to multiple commands (fs, dfs, fsck, distcp etc)
export HADOOP_CLIENT_OPTS="$HADOOP_CLIENT_OPTS"
# set heap args when HADOOP_HEAPSIZE is empty
if [ "$HADOOP_HEAPSIZE" = "" ]; then
export HADOOP_CLIENT_OPTS="-Xmx512m $HADOOP_CLIENT_OPTS"
fi
#HADOOP_JAVA_PLATFORM_OPTS="-XX:-UsePerfData $HADOOP_JAVA_PLATFORM_OPTS"

# On secure datanodes, user to run the datanode as after dropping privileges.
# This **MUST** be uncommented to enable secure HDFS if using privileged ports
# to provide authentication of data transfer protocol. This **MUST NOT** be
# defined if SASL is configured for authentication of data transfer protocol
# using non-privileged ports.
export HADOOP_SECURE_DN_USER=${HADOOP_SECURE_DN_USER}

# Where log files are stored. $HADOOP_HOME/logs by default.
#export HADOOP_LOG_DIR=${HADOOP_LOG_DIR}/$USER

# Where log files are stored in the secure data environment.
#export HADOOP_SECURE_DN_LOG_DIR=${HADOOP_LOG_DIR}/${HADOOP_HDFS_USER}

###
# HDFS Mover specific parameters
###
# Specify the JVM options to be used when starting the HDFS Mover.
# These options will be appended to the options specified as HADOOP_OPTS
# and therefore may override any similar flags set in HADOOP_OPTS
#
# export HADOOP_MOVER_OPTS=""

###
# Advanced Users Only!
###

# The directory where pid files are stored. /tmp by default.
# NOTE: this should be set to a directory that can only be written to by
# the user that will run the hadoop daemons. Otherwise there is the
# potential for a symlink attack.
export HADOOP_PID_DIR=${HADOOP_PID_DIR}
export HADOOP_SECURE_DN_PID_DIR=${HADOOP_PID_DIR}

# A string representing this instance of hadoop. $USER by default.
export HADOOP_IDENT_STRING=$USER
52 changes: 52 additions & 0 deletions jmx-scraper/src/main/resources/hadoop.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
---
# JMX scraper rules for the Hadoop NameNode ("hadoop" target system).
# All metrics are read from the FSNamesystem MBean and reported as
# non-monotonic sums (updowncounter).
rules:
  - bean: Hadoop:service=NameNode,name=FSNamesystem
    prefix: hadoop.name_node.
    type: updowncounter
    metricAttribute:
      # node_name is read from the MBean attribute "tag.Hostname"
      # (the dot is escaped so it is not treated as a path separator).
      node_name: beanattr(tag\.Hostname)
    mapping:
      CapacityUsed:
        metric: capacity.usage
        unit: By
        desc: The current used capacity across all data nodes reporting to the name node.
      CapacityTotal:
        metric: capacity.limit
        unit: By
        desc: The total capacity allotted to data nodes reporting to the name node.
      BlocksTotal:
        metric: block.count
        unit: "{block}"
        desc: The total number of blocks on the name node.
      MissingBlocks:
        metric: block.missing
        unit: "{block}"
        desc: The number of blocks reported as missing to the name node.
      CorruptBlocks:
        metric: block.corrupt
        unit: "{block}"
        desc: The number of blocks reported as corrupt to the name node.
      VolumeFailuresTotal:
        metric: volume.failed
        unit: "{volume}"
        desc: The number of failed volumes reported to the name node.
      FilesTotal:
        metric: file.count
        unit: "{file}"
        desc: The total number of files being tracked by the name node.
      TotalLoad:
        metric: file.load
        unit: "{operation}"
        desc: The current number of concurrent file accesses.
      # Live and dead data-node counts map to the same metric; YAML anchors
      # (&metric/&unit/&desc) share the name, unit and description, and only
      # the constant "state" attribute differs between the two data points.
      NumLiveDataNodes:
        metric: &metric data_node.count
        unit: &unit "{node}"
        desc: &desc The number of data nodes reporting to the name node.
        metricAttribute:
          state: const(live)
      NumDeadDataNodes:
        metric: *metric
        unit: *unit
        desc: *desc
        metricAttribute:
          state: const(dead)