背景:线上服务器硬盘发生故障,导致服务和数据失效;
目的:保证第一时间发现硬盘信息是否正常;
方案:使用Nagios 自定义脚本来监控硬盘状态;
注意:下面的脚本仅适用于已安装 hpacucli(HP)或 MegaCli(Dell/IBM)软件的服务器;
#!/bin/bash
# Marc.wang 2014/06/17
#
# Nagios passive check for hardware RAID / physical disk health.
#
# Detects the server vendor from dmidecode, queries the matching RAID CLI
# (hpacucli for HP, MegaCli for Dell/IBM) and submits one passive result
# to the Nagios server through send_nsca, formatted as
#   <host>;check_raid;<state>;<message>
# The script exits with the same state it reported: 0 = OK, 2 = CRITICAL.

# Fail loudly on typos in variable names; command failures are handled
# explicitly below, so 'set -e' is intentionally not used.
set -u

export PATH=$PATH:/usr/sbin/:/sbin/:/usr/bin/

readonly NAGIOS_SERVER="nagios.org"
readonly SEND_NSCA_BIN="/usr/local/nagios/bin/send_nsca"
readonly SEND_NSCA_CFG="/usr/local/nagios/etc/send_nsca.cfg"
readonly MEGACLI_DEFAULT="/opt/MegaRAID/MegaCli/MegaCli64"
# Summed "Media Error Count" + "Other Error Count" above which the array
# is reported as critical.
readonly MEGACLI_ERROR_THRESHOLD=2000

# First address printed by 'hostname -I'; used as the host field of the result.
Get_localhost_Hostname=$(hostname -I | awk '{print $1}')

# First word of the first "Vendor" line printed by dmidecode.
SERVER_TYPE=$(/usr/sbin/dmidecode \
  | awk -F: '/Vendor/ { split($2, words, " "); print words[1]; exit }')

#######################################
# Forward a passive check result read from stdin to the Nagios server.
# Globals: NAGIOS_SERVER, SEND_NSCA_BIN, SEND_NSCA_CFG (read)
#######################################
Send_nsca_ssl_message() {
  "$SEND_NSCA_BIN" -H "$NAGIOS_SERVER" -d ";" -c "$SEND_NSCA_CFG"
}

#######################################
# Submit one check_raid result and terminate with the same state.
# Globals:   Get_localhost_Hostname (read)
# Arguments: $1 - state (0 or 2), $2 - message text
#######################################
report_and_exit() {
  local state=$1
  local message=$2
  printf '%s;check_raid;%s;%s\n' \
    "$Get_localhost_Hostname" "$state" "$message" | Send_nsca_ssl_message
  exit "$state"
}

#######################################
# Count physical drives whose status is not OK.
# Reads 'hpacucli ctrl all show config detail' output on stdin and prints
# the number of "Status:" lines (within 4 lines after each physicaldrive
# header) that are not "Status:OK" once spaces are removed.
#######################################
count_failed_hp_drives() {
  grep -A 4 physicaldrive \
    | sed 's/ //g' \
    | grep 'Status:' \
    | grep -vc 'Status:OK'
}

#######################################
# Sum the MegaCli media/other error counters.
# Reads 'MegaCli -LdPdInfo' output on stdin. Prints the total, or nothing
# at all when no counter line was present.
#######################################
sum_megacli_errors() {
  awk -F: '
    /(Media Error Count:|Other Error Count:)/ { total += $2; seen = 1 }
    END { if (seen) print total }
  '
}

#######################################
# HP servers: evaluate drive status with hpacucli and report the result.
# Globals: SERVER_TYPE (read)
#######################################
check_hp() {
  local output status failed

  if ! command -v hpacucli >/dev/null 2>&1; then
    report_and_exit 2 " $SERVER_TYPE command hpacucli Not Found"
  fi

  # One invocation yields both the text and the exit status. stderr is
  # dropped on purpose: a failure is reported through the exit status.
  output=$(hpacucli ctrl all show config detail 2>/dev/null)
  status=$?
  if (( status != 0 )); then
    report_and_exit 2 "Check_Raid_status: run command hpacucli Error"
  fi

  failed=$(printf '%s\n' "$output" | count_failed_hp_drives)
  if [[ "$failed" == "0" ]]; then
    report_and_exit 0 "Check_Raid_status:OK"
  fi
  report_and_exit 2 "Check_Raid_status:Critical"
}

#######################################
# Dell/IBM servers: evaluate error counters with MegaCli and report.
# Globals: MEGACLI_DEFAULT, MEGACLI_ERROR_THRESHOLD (read)
#######################################
check_megacli() {
  local megacli output status errors

  # Prefer the vendor install location, otherwise rely on PATH.
  if [[ -f "$MEGACLI_DEFAULT" ]]; then
    megacli=$MEGACLI_DEFAULT
  elif command -v MegaCli >/dev/null 2>&1; then
    megacli=MegaCli
  else
    report_and_exit 2 "Check_Raid_status:MegaCli Command Not Found!"
  fi

  # stderr is dropped on purpose: a failure is reported through the exit
  # status and through the missing counters.
  output=$("$megacli" -LdPdInfo -a0 2>/dev/null)
  status=$?
  errors=$(printf '%s\n' "$output" | sum_megacli_errors)

  if [[ -z "$errors" ]]; then
    report_and_exit 2 "Check_Raid_status:MegaCli Command Not Found!"
  fi
  if (( errors > MEGACLI_ERROR_THRESHOLD )); then
    report_and_exit 2 "Check_Raid_status:Critical"
  fi
  if (( status != 0 )); then
    # Counters were printed but the tool still signalled a failure.
    report_and_exit 2 "Check_Raid_status: run command MegaCli Error"
  fi
  report_and_exit 0 "Check_Raid_status:OK"
}

main() {
  # An hpacucli process that is still around means an earlier run has not
  # finished; do not start another one on top of it.
  if pgrep -f hpacucli >/dev/null 2>&1; then
    report_and_exit 2 " hpacucli command run not data."
  fi

  # Match the vendor string regardless of letter case.
  shopt -s nocasematch
  case "$SERVER_TYPE" in
    hp | hewlett-packard)
      check_hp
      ;;
    dell | ibm)
      check_megacli
      ;;
    *)
      report_and_exit 2 "This machine is not IBM DELL or HP!"
      ;;
  esac
}

main "$@"
本文出自 “chinaops” 博客,请务必保留此出处http://cnops.blog.51cto.com/9374660/1551793
原文地址:http://cnops.blog.51cto.com/9374660/1551793