2020 if [ -z " ${ARTIFACT_DIR:- } " ]; then
2121 export ARTIFACT_DIR=" /tmp/nvidia-gpu-operator_$( date +%Y%m%d_%H%M) "
2222 fi
23- echo " Using ARTIFACT_DIR=$ARTIFACT_DIR "
23+ echo " Using ARTIFACT_DIR=${ ARTIFACT_DIR} "
2424fi
2525
# Make sure the artifact directory exists before anything is written into it.
mkdir -p "${ARTIFACT_DIR}"

echo

# From here on, stdout is duplicated into must-gather.log (tee keeps printing
# it as well) and stderr goes only to must-gather.stderr.log.
exec 1> >(tee "${ARTIFACT_DIR}/must-gather.log")
exec 2> "${ARTIFACT_DIR}/must-gather.stderr.log"
3232
# When the script is invoked as /usr/bin/gather, write a two-line version
# file: the component name, then $VERSION (or "N/A" when VERSION is unset/empty).
if [[ "$0" == "/usr/bin/gather" ]]; then
    echo "NVIDIA GPU Operator" > "${ARTIFACT_DIR}/version"
    echo "${VERSION:-N/A}" >> "${ARTIFACT_DIR}/version"
fi
3737
# Probe for the OpenShift ClusterVersion object. A failing query is tolerated
# (|| true) and simply leaves ocp_cluster empty.
# NOTE(review): $K is defined outside this chunk (presumably the kubectl/oc
# command); it is left unquoted exactly as in the original.
ocp_cluster=$($K get clusterversion/version --ignore-not-found -oname || true)

if [[ "$ocp_cluster" ]]; then
    echo "Running in OpenShift."
    echo "Get the cluster version"
    $K get clusterversion/version -oyaml > "${ARTIFACT_DIR}/openshift_version.yaml"
fi

echo
@@ -66,7 +66,7 @@ if [ -z "$OPERATOR_POD_NAME" ]; then
6666 exit 1
6767fi
6868
# Look up the namespace of the pod(s) labelled app=gpu-operator across all
# namespaces. The jsonpath expression is single-quoted so the shell never
# treats the braces/brackets as glob or brace-expansion syntax.
OPERATOR_NAMESPACE=$($K get pods -lapp=gpu-operator -A -ojsonpath='{.items[].metadata.namespace}' --ignore-not-found)

echo "Using '$OPERATOR_NAMESPACE' as operator namespace"
echo ""
9393
# Save the ClusterPolicy as YAML, or drop a marker file when none is found.
CLUSTER_POLICY_NAME=$($K get clusterpolicy -oname)

if [[ "${CLUSTER_POLICY_NAME}" ]]; then
    echo "Get ${CLUSTER_POLICY_NAME}"
    # NOTE(review): the name is passed as a single quoted argument, which
    # assumes the query above returned exactly one resource — confirm.
    $K get -oyaml "${CLUSTER_POLICY_NAME}" > "${ARTIFACT_DIR}/cluster_policy.yaml"
else
    echo "Mark the ClusterPolicy as missing"
    touch "${ARTIFACT_DIR}/cluster_policy.missing"
fi

echo
@@ -109,15 +109,15 @@ echo
109109
# Only when ocp_cluster is non-empty: list the machines of every namespace.
if [[ "$ocp_cluster" ]]; then
    echo "Get all the machines"
    $K get machines -A > "${ARTIFACT_DIR}/all_machines.list"
fi
114114
echo "Get the labels of the nodes with NVIDIA PCI cards"

# Node labels that are queried to find nodes carrying an NVIDIA PCI device.
GPU_PCI_LABELS=(feature.node.kubernetes.io/pci-10de.present feature.node.kubernetes.io/pci-0302_10de.present feature.node.kubernetes.io/pci-0300_10de.present)

# Collect the matching node names into one whitespace-separated string.
# A separator is added only between non-empty results, so the variable stays
# truly empty when no label matches and a later `-z "$gpu_pci_nodes"` test
# can detect that case (unconditional concatenation left it as blanks).
gpu_pci_nodes=""
for label in "${GPU_PCI_LABELS[@]}"; do
    label_nodes=$($K get nodes -l"${label}" -oname)
    if [[ -n "${label_nodes}" ]]; then
        gpu_pci_nodes="${gpu_pci_nodes:+${gpu_pci_nodes} }${label_nodes}"
    fi
done
123123
@@ -127,23 +127,23 @@ if [ -z "$gpu_pci_nodes" ]; then
127127fi
128128
# For every collected node, append to gpu_nodes.labels: the node name, then
# its labels reformatted by the sed/tr pipeline from the JSON map into one
# "- key=value" entry per line, then an empty separator line.
# shellcheck disable=SC2086 # word splitting of the node list is intentional
for node in ${gpu_pci_nodes}; do
    echo "${node}" | cut -d/ -f2 >> "${ARTIFACT_DIR}/gpu_nodes.labels"
    $K get "${node}" '-ojsonpath={.metadata.labels}' \
        | sed 's|,|,- |g' \
        | tr ',' '\n' \
        | sed 's/{"/- /' \
        | tr : = \
        | sed 's/"//g' \
        | sed 's/}/\n/' \
        >> "${ARTIFACT_DIR}/gpu_nodes.labels"
    echo "" >> "${ARTIFACT_DIR}/gpu_nodes.labels"
done
141141
# Nodes labelled nvidia.com/gpu.present=true: wide listing, then description.
echo "Get the GPU nodes (status)"
$K get nodes -l nvidia.com/gpu.present=true -o wide > "${ARTIFACT_DIR}/gpu_nodes.status"

echo "Get the GPU nodes (description)"
$K describe nodes -l nvidia.com/gpu.present=true > "${ARTIFACT_DIR}/gpu_nodes.descr"

echo ""
echo "#"
@@ -152,77 +152,77 @@ echo "#"
echo

# Capture the operator pod itself: wide status, full YAML, current logs and
# the logs of the previous container instance (--previous).
# OPERATOR_POD_NAME is set outside this chunk.
echo "Get the GPU Operator Pod (status)"
$K get "${OPERATOR_POD_NAME}" \
    -owide \
    -n "${OPERATOR_NAMESPACE}" \
    > "${ARTIFACT_DIR}/gpu_operator_pod.status"

echo "Get the GPU Operator Pod (yaml)"
$K get "${OPERATOR_POD_NAME}" \
    -oyaml \
    -n "${OPERATOR_NAMESPACE}" \
    > "${ARTIFACT_DIR}/gpu_operator_pod.yaml"

echo "Get the GPU Operator Pod logs"
$K logs "${OPERATOR_POD_NAME}" \
    -n "${OPERATOR_NAMESPACE}" \
    > "${ARTIFACT_DIR}/gpu_operator_pod.log"

$K logs "${OPERATOR_POD_NAME}" \
    -n "${OPERATOR_NAMESPACE}" \
    --previous \
    > "${ARTIFACT_DIR}/gpu_operator_pod.previous.log"
175175
echo ""
echo "#"
echo "# Operand Pods"
echo "#"
echo ""

# Namespace-wide pod snapshots: wide listing, full YAML, and a
# "<pod name>:<TAB><container images>" summary built with jsonpath.
echo "Get the Pods in ${OPERATOR_NAMESPACE} (status)"
$K get pods -owide \
    -n "${OPERATOR_NAMESPACE}" \
    > "${ARTIFACT_DIR}/gpu_operand_pods.status"

echo "Get the Pods in ${OPERATOR_NAMESPACE} (yaml)"
$K get pods -oyaml \
    -n "${OPERATOR_NAMESPACE}" \
    > "${ARTIFACT_DIR}/gpu_operand_pods.yaml"

echo "Get the GPU Operator Pods Images"
$K get pods -n "${OPERATOR_NAMESPACE}" \
    -o=jsonpath='{range .items[*]}{"\n"}{.metadata.name}{":\t"}{range .spec.containers[*]}{.image}{" "}{end}{end}' \
    > "${ARTIFACT_DIR}/gpu_operand_pod_images.txt"
196196
echo "Get the description and logs of the GPU Operator Pods"

# Walk every pod of the operator namespace. Pods whose labels mention neither
# "nvidia" nor "gpu" are skipped, and so is the operator pod itself. For each
# remaining pod the current logs, previous logs and description are saved.
for pod in $($K get pods -n "${OPERATOR_NAMESPACE}" -oname);
do
    if ! $K get "${pod}" -n "${OPERATOR_NAMESPACE}" -ojsonpath='{.metadata.labels}' | grep -E --quiet '(nvidia|gpu)'; then
        echo "Skipping $pod, not a NVIDA/GPU Pod ..."
        continue
    fi
    # Strip the "<kind>/" prefix that -oname puts in front of the pod name.
    pod_name=${pod#*/}

    if [[ "${pod}" == "${OPERATOR_POD_NAME}" ]]; then
        echo "Skipping operator pod $pod_name ..."
        continue
    fi

    $K logs "${pod}" \
        -n "${OPERATOR_NAMESPACE}" \
        --all-containers --prefix \
        > "${ARTIFACT_DIR}/gpu_operand_pod_${pod_name}.log"

    $K logs "${pod}" \
        -n "${OPERATOR_NAMESPACE}" \
        --all-containers --prefix \
        --previous \
        > "${ARTIFACT_DIR}/gpu_operand_pod_${pod_name}.previous.log"

    $K describe "${pod}" \
        -n "${OPERATOR_NAMESPACE}" \
        > "${ARTIFACT_DIR}/gpu_operand_pod_${pod_name}.descr"
done
227227
228228echo " "
@@ -234,27 +234,26 @@ echo ""
# DaemonSets of the operator namespace: listing, full YAML, then one
# description file per DaemonSet whose labels mention "nvidia" or "gpu".
echo "Get the DaemonSets in $OPERATOR_NAMESPACE (status)"

$K get ds \
    -n "${OPERATOR_NAMESPACE}" \
    > "${ARTIFACT_DIR}/gpu_operand_ds.status"

echo "Get the DaemonSets in $OPERATOR_NAMESPACE (yaml)"

$K get ds -oyaml \
    -n "${OPERATOR_NAMESPACE}" \
    > "${ARTIFACT_DIR}/gpu_operand_ds.yaml"

echo "Get the description of the GPU Operator DaemonSets"

for ds in $($K get ds -n "${OPERATOR_NAMESPACE}" -oname);
do
    if ! $K get "${ds}" -n "${OPERATOR_NAMESPACE}" -ojsonpath='{.metadata.labels}' | grep -E --quiet '(nvidia|gpu)'; then
        echo "Skipping ${ds}, not a NVIDA/GPU DaemonSet ..."
        continue
    fi
    # Strip the "<kind>/" prefix that -oname puts in front of the name.
    ds_name=${ds#*/}
    $K describe "${ds}" \
        -n "${OPERATOR_NAMESPACE}" \
        > "${ARTIFACT_DIR}/gpu_operand_ds_${ds_name}.descr"
done
259258
260259echo " "
@@ -263,18 +262,18 @@ echo "# nvidia-bug-report.sh"
263262echo " #"
264263echo " "
265264
# Run nvidia-bug-report.sh inside each driver-toolkit / driver-daemonset /
# vgpu-manager-daemonset pod and copy the resulting archive into ARTIFACT_DIR,
# named after the node the pod runs on.
for pod in $($K get pods -lopenshift.driver-toolkit -oname -n "${OPERATOR_NAMESPACE}"; $K get pods -lapp=nvidia-driver-daemonset -oname -n "${OPERATOR_NAMESPACE}"; $K get pods -lapp=nvidia-vgpu-manager-daemonset -oname -n "${OPERATOR_NAMESPACE}");
do
    pod_nodename=$($K get "${pod}" -ojsonpath='{.spec.nodeName}' -n "${OPERATOR_NAMESPACE}")
    echo "Saving nvidia-bug-report from ${pod_nodename} ..."

    # Failure handling uses plain `if` blocks: a `continue` placed inside a
    # ( ... ) subshell only ends that subshell and would let the loop body
    # carry on with the copy/move steps after a failure.
    if ! $K exec -n "${OPERATOR_NAMESPACE}" "${pod}" -- bash -c 'cd /tmp && nvidia-bug-report.sh' >&2; then
        echo "Failed to collect nvidia-bug-report from ${pod_nodename}"
        continue
    fi

    if ! $K cp "${OPERATOR_NAMESPACE}/$(basename "${pod}"):/tmp/nvidia-bug-report.log.gz" /tmp/nvidia-bug-report.log.gz; then
        echo "Failed to save nvidia-bug-report from ${pod_nodename}"
        continue
    fi

    mv /tmp/nvidia-bug-report.log.gz "${ARTIFACT_DIR}/nvidia-bug-report_${pod_nodename}.log.gz"
done
279278
280279echo " "
0 commit comments