From 4748c4debb1053988abc70f560267d9f7bbae052 Mon Sep 17 00:00:00 2001 From: gallanik <84850964+gallanik@users.noreply.github.com> Date: Thu, 14 Dec 2023 16:13:17 -0500 Subject: [PATCH 1/5] Adding support for p5.48xlarge and p4de.24xlarge New GPU instance types added to the AWS instance family --- prometheus/prometheus.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/prometheus/prometheus.yml b/prometheus/prometheus.yml index a9b5197..fb894dc 100755 --- a/prometheus/prometheus.yml +++ b/prometheus/prometheus.yml @@ -49,6 +49,8 @@ scrape_configs: - p3.16xlarge - p3dn.24xlarge - p4d.24xlarge + - p4de.24xlarge + - p5.48xlarge - g3s.xlarge - g3.4xlarge - g3.8xlarge From e90d3bb8f1f1e3b022939dfc5994c7e1f25e2f16 Mon Sep 17 00:00:00 2001 From: gallanik <84850964+gallanik@users.noreply.github.com> Date: Thu, 14 Dec 2023 16:20:54 -0500 Subject: [PATCH 2/5] Update FSx parameter name in config In the new releases, the key for the FSx file system ID in the chef dna.json file changed from cfn_fsx_fs_id to fsx_fs_ids. Because of this, FSx metrics were not captured on the dashboard. This update should fix it.
--- parallelcluster-setup/install-monitoring.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parallelcluster-setup/install-monitoring.sh b/parallelcluster-setup/install-monitoring.sh index 7089f5a..d30e9ae 100755 --- a/parallelcluster-setup/install-monitoring.sh +++ b/parallelcluster-setup/install-monitoring.sh @@ -30,7 +30,7 @@ case "${cfn_node_type}" in #cfn_efs=$(cat /etc/chef/dna.json | grep \"cfn_efs\" | awk '{print $2}' | sed "s/\",//g;s/\"//g") #cfn_cluster_cw_logging_enabled=$(cat /etc/chef/dna.json | grep \"cfn_cluster_cw_logging_enabled\" | awk '{print $2}' | sed "s/\",//g;s/\"//g") - cfn_fsx_fs_id=$(cat /etc/chef/dna.json | grep \"cfn_fsx_fs_id\" | awk '{print $2}' | sed "s/\",//g;s/\"//g") + cfn_fsx_fs_id=$(cat /etc/chef/dna.json | grep \"fsx_fs_ids\" | awk '{print $2}' | sed "s/\",//g;s/\"//g") master_instance_id=$(ec2-metadata -i | awk '{print $2}') cfn_max_queue_size=$(aws cloudformation describe-stacks --stack-name $stack_name --region $cfn_region | jq -r '.Stacks[0].Parameters | map(select(.ParameterKey == "MaxSize"))[0].ParameterValue') s3_bucket=$(echo $cfn_postinstall | sed "s/s3:\/\///g;s/\/.*//") From 8836dedd3c6000255164704ec077730143039e4d Mon Sep 17 00:00:00 2001 From: gallanik <84850964+gallanik@users.noreply.github.com> Date: Mon, 15 Jan 2024 00:11:42 -0500 Subject: [PATCH 3/5] Update post-install.sh --- post-install.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/post-install.sh b/post-install.sh index 6a907ec..db0358e 100644 --- a/post-install.sh +++ b/post-install.sh @@ -14,7 +14,7 @@ monitoring_dir_name=aws-parallelcluster-monitoring monitoring_tarball="${monitoring_dir_name}.tar.gz" #get GitHub repo to clone and the installation script -monitoring_url=https://github.com/aws-samples/aws-parallelcluster-monitoring/archive/refs/tags/${version}.tar.gz +monitoring_url=https://github.com/gallanik/aws-parallelcluster-monitoring/archive/refs/tags/${version}.tar.gz setup_command=install-monitoring.sh 
monitoring_home="/home/${cfn_cluster_user}/${monitoring_dir_name}" From 233f9f5cd39dae6b85a5aa6404dc6b926d6a69bb Mon Sep 17 00:00:00 2001 From: gallanik <84850964+gallanik@users.noreply.github.com> Date: Tue, 16 Jan 2024 00:48:18 -0500 Subject: [PATCH 4/5] Update post-install.sh Fixing repo name in the monitoring_url variable --- post-install.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/post-install.sh b/post-install.sh index db0358e..6a907ec 100644 --- a/post-install.sh +++ b/post-install.sh @@ -14,7 +14,7 @@ monitoring_dir_name=aws-parallelcluster-monitoring monitoring_tarball="${monitoring_dir_name}.tar.gz" #get GitHub repo to clone and the installation script -monitoring_url=https://github.com/gallanik/aws-parallelcluster-monitoring/archive/refs/tags/${version}.tar.gz +monitoring_url=https://github.com/aws-samples/aws-parallelcluster-monitoring/archive/refs/tags/${version}.tar.gz setup_command=install-monitoring.sh monitoring_home="/home/${cfn_cluster_user}/${monitoring_dir_name}" From 5c27c1aa324c291d04b61f0da895adef5e84ce41 Mon Sep 17 00:00:00 2001 From: gallanik <84850964+gallanik@users.noreply.github.com> Date: Tue, 16 Jan 2024 06:29:10 -0500 Subject: [PATCH 5/5] Update post-install.sh --- post-install.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/post-install.sh b/post-install.sh index 6a907ec..db0358e 100644 --- a/post-install.sh +++ b/post-install.sh @@ -14,7 +14,7 @@ monitoring_dir_name=aws-parallelcluster-monitoring monitoring_tarball="${monitoring_dir_name}.tar.gz" #get GitHub repo to clone and the installation script -monitoring_url=https://github.com/aws-samples/aws-parallelcluster-monitoring/archive/refs/tags/${version}.tar.gz +monitoring_url=https://github.com/gallanik/aws-parallelcluster-monitoring/archive/refs/tags/${version}.tar.gz setup_command=install-monitoring.sh monitoring_home="/home/${cfn_cluster_user}/${monitoring_dir_name}"