β

linux监测CPU主板温度及告警

第四城社区 152 阅读
随着夏天的临近,伴随温度持续上升,随之而来机房里的服务器也开始“发烧”了。

为了避免线上业务服务器因为温度过高而宕机,提高业务的可用性,我们需要掌控服务器的温度,
并在温度达到一定高度时能够及时告警通知,以便对服务器做进一步的处理,防止宕机。

安装环境:
服务器 DELL R710 
操作系统 CentOS 5.5
1、安装硬件传感器监控软件sensors
yum install -y lm_sensors

rpm -qa|grep coretemp
如果未安装coretemp,则
wget http://www.pperry.f2s.com/linux/coretemp/kmod-coretemp-1.1-2.el5.x86_64.rpm
rpm -ivh kmod-coretemp-1.1-2.el5.x86_64.rpm
2、运行sensors-detect进行传感器检测
sensors-detect  
PS:一路回车即可。
3、运行sensors看是否能读取数据,查看温度
[root@DBserver ~]# sensors
w83793-i2c-0-2f
Adapter: SMBus I801 adapter at 1100
VcoreA:       +1.20 V  (min =  +0.92 V, max =  +1.49 V)
VcoreB:       +1.20 V  (min =  +0.92 V, max =  +1.49 V)
in2:          +1.09 V  (min =  +0.99 V, max =  +1.33 V)
in3:          +0.00 V  (min =  +0.38 V, max =  +0.69 V)  ALARM
in4:          +1.50 V  (min =  +1.34 V, max =  +1.98 V)
in5:          +3.28 V  (min =  +2.96 V, max =  +3.63 V)
in6:          +1.00 V  (min =  +0.90 V, max =  +1.10 V)
+5V:          +5.12 V  (min =  +4.64 V, max =  +5.65 V)
VSB:         +5.09 V  (min =  +4.64 V, max =  +5.65 V)
Vbat:         +3.31 V  (min =  +2.99 V, max =  +3.66 V)
fan1:        7142 RPM  (min =  712 RPM)
fan2:        9712 RPM  (min =  712 RPM)
fan3:        7219 RPM  (min =  712 RPM)
fan4:        9310 RPM  (min =  712 RPM)
fan5:        6923 RPM  (min =  712 RPM)
fan6:        9854 RPM  (min =  712 RPM)
fan7:        6994 RPM  (min =  712 RPM)
fan8:        9782 RPM  (min =  712 RPM)
fan9:           0 RPM  (min =  712 RPM)  ALARM
fan10:          0 RPM  (min =  712 RPM)  ALARM
temp1:        +52.0°C  (high = +87.0°C, hyst = +82.0°C)  sensor = Intel PECI
temp2:        +48.0°C  (high = +87.0°C, hyst = +82.0°C)  sensor = Intel PECI
temp3:       -128.0°C  (high = +87.0°C, hyst = +82.0°C)  sensor = Intel PECI
temp4:       -128.0°C  (high = +87.0°C, hyst = +82.0°C)  sensor = Intel PECI
temp5:        +35.0°C  (high = +75.0°C, hyst = +70.0°C)  sensor = thermistor
beep_enable: disabled
4、配置Nagios告警
vi /usr/local/nagios/libexec/check_cputemp
#!/bin/sh
#########check_cputemp###########
#date : May 2013
#Licence GPLv2
#by Barlow
#/usr/local/nagios/libexec/check_cputemp
#you can use NRPE to define service in nagios
#check_nrpe!check_cputemp
# Plugin return states.
# The threshold check at the end of this script copies one of these values
# into STATE and exits with it.
STATE_OK=0
STATE_WARNING=1
STATE_CRITICAL=2
STATE_UNKNOWN=3
# Print the one-line hint that points the user at the -h option.
# The text is emitted through the command stored in $Echo, which the caller
# must have set before invoking this function.
print_help_msg() {
  $Echo "Usage: ${0} -h to get help."
}
# Print the complete usage text: synopsis, the constraint between the two
# thresholds, and an example invocation. Every line is emitted through the
# command stored in $Echo.
print_full_help_msg() {
  # Positional parameters are private to a function call, so they can hold
  # the list of help lines without creating any global variable.
  set -- \
    "Usage:" \
    "${0} [ -v ] -m sensors -w cpuT -c cpuT" \
    "Sepicify the method to use the temperature data sensors." \
    "And the corresponding Critical value must greater than Warning value." \
    "Example:" \
    "${0} -m sensors -w 40 -c 50"
  while [ "$#" -gt 1 ]; do
    $Echo "$1"
    shift
  done
  # The last line is printed outside the loop so that its exit status
  # becomes the status of the function.
  $Echo "$1"
}
# Print a generic error marker (through $Echo) and then the complete usage
# text produced by print_full_help_msg.
print_err_msg() {
  $Echo "Error."
  print_full_help_msg
}
# Append a debug message to the per-process log file when verbose mode is on.
# Arguments: the words of the message; they are joined into a single string.
# Does nothing and returns 0 unless the global Debug equals "true".
to_debug() {
  [ "$Debug" = "true" ] || return 0
  # $$ is the PID of the running script, so every invocation gets its own log.
  $Echo "$*" >> "/var/log/check_sys_temperature.log.$$" 2>&1
}
# ---------------------------------------------------------------------------
# Command-line handling.
#   -v          verbose: to_debug starts writing to its log file
#   -m METHOD   data source; only "sensors" is accepted
#   -w VALUE    warning threshold  (integer; 0 or omitted = disabled)
#   -c VALUE    critical threshold (integer; 0 or omitted = disabled)
#   -h          print the full help text
# Every usage problem ends the plugin with the UNKNOWN state.
# ---------------------------------------------------------------------------

# NOTE(review): presumably meant to keep tool output untranslated so that it
# can be parsed; LC_ALL is not touched here - confirm whether it should be.
unset LANG
# All messages of this script are printed through $Echo.
Echo="echo -e"

# Succeed when $1 is a decimal integer, optionally preceded by a minus sign.
is_integer() {
  case "${1#-}" in
    '' | *[!0-9]*) return 1 ;;
  esac
  return 0
}

if [ $# -lt 1 ]; then
  print_help_msg
  exit "$STATE_UNKNOWN"
fi

# The leading ':' selects silent error reporting: getopts then sets OPTION to
# ':' for a missing argument and to '?' for an unknown option, with the
# offending option letter in OPTARG.
while getopts :vhm:w:c: OPTION; do
  case "$OPTION" in
    v)
      Debug=true
      ;;
    m)
      method=$OPTARG
      ;;
    w)
      WARNING=$OPTARG
      ;;
    c)
      CRITICAL=$OPTARG
      ;;
    h)
      print_full_help_msg
      exit "$STATE_UNKNOWN"
      ;;
    :)
      $Echo "Error: Option -$OPTARG requires an argument."
      print_help_msg
      exit "$STATE_UNKNOWN"
      ;;
    \?)
      # Escaped on purpose: a bare ? is a glob that matches any single
      # character and would also swallow the ':' case above.
      $Echo "Error: Illegal Option."
      print_help_msg
      exit "$STATE_UNKNOWN"
      ;;
  esac
done

if [ "$method" = "sensors" ]; then
  use_sensors="true"
  to_debug use_sensors
else
  $Echo "Error. Must to sepcify the method to use sensors."
  print_full_help_msg
  exit "$STATE_UNKNOWN"
fi

# The threshold comparison later in this script treats "0" as "disabled", so
# an omitted threshold is normalised to 0 instead of being left empty (an
# empty value makes the numeric test fail with an error message).
WARNING=${WARNING:-0}
CRITICAL=${CRITICAL:-0}
if ! is_integer "$WARNING" || ! is_integer "$CRITICAL"; then
  $Echo "Error: Warning and Critical values must be integers."
  print_help_msg
  exit "$STATE_UNKNOWN"
fi

to_debug "All Values are \" Warning: $WARNING and Critical: $CRITICAL \"."
#########lm_sensors##################
# Read the CPU temperatures from `sensors` and store their average, truncated
# to whole degrees, in TEMP. Any failure to obtain a complete set of readings
# ends the plugin with the UNKNOWN state.
#
# The readings are taken from consecutive lines of the `sensors` output,
# third whitespace-separated field. Adjust the two values below to your
# hardware: the original author had a four-core CPU and therefore averaged
# four readings starting at line 3.
SENSOR_FIRST_LINE=3
SENSOR_CORE_COUNT=4

if [ "$use_sensors" = "true" ]; then
  if ! sensorsCheckOut=$(command -v sensors 2>&1); then
    echo "sensors command not found in PATH."
    echo Maybe you need to check your sensors.
    exit "$STATE_UNKNOWN"
  fi
  to_debug "Use $sensorsCheckOut to check system temperature"

  # Run sensors once so that all readings come from the same snapshot.
  sensorsOutput=$(sensors)

  # From each selected line take the leading signed integer of field 3
  # (e.g. "+52.0°C" -> 52, "+100.0°C" -> 100, "+9.0°C" -> 9). The average is
  # printed only when every selected line produced a number; otherwise TEMP
  # stays empty and the check below reports the problem.
  TEMP=$(printf '%s\n' "$sensorsOutput" | awk \
    -v first="$SENSOR_FIRST_LINE" -v count="$SENSOR_CORE_COUNT" '
    NR >= first && NR < first + count {
      if (match($3, /[-+]?[0-9]+/)) {
        sum += substr($3, RSTART, RLENGTH)
        found++
      }
    }
    END {
      if (found == count) {
        print int(sum / count)
      }
    }')

  if [ -z "$TEMP" ]; then
    $Echo "No Data been get here. Please confirm your ARGS and re-check it with Verbose mode, 
then to check the log."
    exit "$STATE_UNKNOWN"
  fi
  to_debug "temperature data is $TEMP"
else
  $Echo "Error. Must to sepcify the method to use sensors"
  print_full_help_msg
  exit "$STATE_UNKNOWN"
fi
######### Comparison with the warning and critical thresholds given by user ############
# Map the measured temperature to a plugin state, print the status line with
# performance data, and exit with that state.
CPU_TEMP=$TEMP

# A threshold that is empty or "0" counts as disabled. That check comes first
# so the numeric comparison never runs against an empty value.
# A state is raised only when the temperature is strictly above the threshold.
if [ -n "$CRITICAL" ] && [ "$CRITICAL" != "0" ] \
  && [ "$CPU_TEMP" -gt "$CRITICAL" ]; then
  STATE="$STATE_CRITICAL"
  STATE_MESSAGE="CRITICAL"
elif [ -n "$WARNING" ] && [ "$WARNING" != "0" ] \
  && [ "$CPU_TEMP" -gt "$WARNING" ]; then
  STATE="$STATE_WARNING"
  STATE_MESSAGE="WARNING"
else
  STATE="$STATE_OK"
  STATE_MESSAGE="OK"
fi
to_debug "$STATE , Message is $STATE_MESSAGE"

# Author's note: the output must carry performance data, i.e. the part after
# the "|" separator, and the unit must not contain Chinese characters,
# otherwise graphing tools such as PNP cannot draw the graph.
# Status text and performance data are printed on ONE line so that the "|"
# section is part of the first output line.
# NOTE(review): the performance-data label and the "Celsius" unit are kept
# unchanged so existing graphs keep their data source; consider an ASCII label.
printf '%s\n' "The TEMPERATURE $STATE_MESSAGE - The CPU's Temperature is $CPU_TEMP ℃ ! | 温度=${CPU_TEMP}Celsius;$WARNING;$CRITICAL"
exit "$STATE"
5、赋予上述脚本执行权限:
chmod +x  /usr/local/nagios/libexec/check_cputemp
6、配置nrpe.cfg
command[check_cputemp]=/usr/local/nagios/libexec/check_cputemp -m sensors -w 40 -c 55
-w 表示警告值,-c表示关键(紧急)值,自行根据实际情况调整
7、在Nagios服务器配置服务:
define service{
use             generic-service,svr-pnp
host_name       <被监控主机的名称>
service_description CPU Temperature
check_command check_nrpe!check_cputemp
}
保存后重启nagios服务即可。

作者:第四城社区
专注、思考、创新
原文地址:linux监测CPU主板温度及告警, 感谢原作者分享。

发表评论