aboutsummaryrefslogtreecommitdiff
path: root/text_collector_examples/mellanox_hca_temp
blob: 0a9e2b0ced2b2732f8beb4c77e5aaf3709f76b79 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
#!/bin/bash
set -eu

# Script to read Mellanox HCA temperature using the Mellanox mget_temp_ext tool

# Copyright 2018 The Prometheus Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Author: Jan Phillip Greimann <jan.greimann@cloud.ionos.com>

# Refuse to run unless the effective user is root (EUID 0).
if (( EUID != 0 )); then
    printf '%s\n' "${0##*/}: Please run as root!" >&2
    exit 1
fi

# The temperature is read with the Mellanox mget_temp_ext tool; give up
# right away if the shell cannot resolve that command.
command -v mget_temp_ext >/dev/null 2>&1 || {
    printf '%s\n' "${0##*/}: mget_temp_ext is not installed. Aborting." >&2
    exit 1
}

# Metric metadata (HELP and TYPE lines), printed once before any samples.
printf '%s\n' \
    '# HELP node_infiniband_hca_temp_celsius Celsius temperature of Mellanox InfiniBand HCA.' \
    '# TYPE node_infiniband_hca_temp_celsius gauge'

#######################################
# Print one node_infiniband_hca_temp_celsius sample for an HCA.
# All whitespace is stripped from the raw reading first. The sample is only
# printed when what remains is a plain decimal number, because an empty or
# non-numeric value would produce a sample line that is not valid in the
# Prometheus text exposition format.
# Arguments: $1 - HCA device name (used as the hca_device label value)
#            $2 - raw output captured from mget_temp_ext
# Outputs:   the sample line on stdout (only on success)
# Returns:   0 if a sample was printed, 1 if the reading was rejected
#######################################
print_hca_temp_metric() {
    local hca="$1"
    local reading="${2//[[:space:]]/}"
    local number_re='^[+-]?[0-9]+([.][0-9]+)?$'

    if [[ ! "${reading}" =~ ${number_re} ]]; then
        return 1
    fi
    printf 'node_infiniband_hca_temp_celsius{hca_device="%s"} %s\n' "${hca}" "${reading}"
}

# run for each found Mellanox device
for dev in /sys/class/infiniband/*; do
    # If the glob matches nothing it stays a literal pattern; the directory
    # test below skips that case as well as any non-directory entry.
    if [[ ! -d "${dev}" ]]; then
        continue
    fi
    # "device" is deliberately left set after the loop: the check at the end
    # of the script uses it to tell whether any HCA was seen at all.
    device="${dev##*/}"

    # get temperature; a failure for one HCA is reported on stderr and does
    # not stop the remaining devices from being read
    if ! temperature="$(mget_temp_ext -d "${device}")"; then
        echo "${0##*/}: Failed to get temperature from InfiniBand HCA '${device}'!" >&2
    elif ! print_hca_temp_metric "${device}" "${temperature}"; then
        echo "${0##*/}: Unexpected temperature reading '${temperature}' from InfiniBand HCA '${device}'!" >&2
    fi
done

# "device" is only assigned inside the per-HCA loop, so finding it unset or
# empty here means that loop never saw a device directory.
[[ -n "${device-}" ]] || {
    printf '%s\n' "${0##*/}: No InfiniBand HCA device found!" >&2
    exit 1
}