码迷,mamicode.com
首页 > 其他好文 > 详细

监控脚本

时间:2016-06-09 06:14:44      阅读:386      评论:0      收藏:0      [点我收藏+]

标签:

zabbix监控脚本
技术分享
#!/bin/bash
# Script to fetch ngstatus statuses for tribily monitoring systems
# Author: krish@toonheart.com
# License: GPLv2
#
# Prints one counter from the /ngstatus page. The parsing below expects the
# "Active connections" line, the three counters on line 3 and the
# "Reading: n Writing: n Waiting: n" line.
#
# Usage: <script> {active|reading|writing|waiting|accepts|handled|requests}

# Set Variables
# The page is requested on the address configured on eth2.
HOST=`/sbin/ifconfig eth2 | sed -n '/inet /{s/.*addr://;s/ .*//;p}'`
PORT="80"

# Fetch the raw status page; curl's stderr output is discarded.
function fetch_status {
    /usr/bin/curl "http://$HOST:$PORT/ngstatus" 2> /dev/null
}

# Functions to return ngstatus stats

function active {
    fetch_status | grep Active | awk '{print $NF}'
}

function reading {
    fetch_status | grep Reading | awk '{print $2}'
}

function writing {
    fetch_status | grep Writing | awk '{print $4}'
}

function waiting {
    fetch_status | grep Waiting | awk '{print $6}'
}

# accepts / handled / requests are the three columns of the third line.
function accepts {
    fetch_status | awk 'NR==3 {print $1}'
}

function handled {
    fetch_status | awk 'NR==3 {print $2}'
}

function requests {
    fetch_status | awk 'NR==3 {print $3}'
}

# Run the requested function
case "$1" in
active|reading|writing|waiting|accepts|handled|requests)
    "$1"
    ;;
*)
    # Only the counters implemented above are advertised; an unknown
    # argument is reported as a failure to the caller.
    echo "Usage: $0 {active|reading|writing|waiting|accepts|handled|requests}"
    exit 1
    ;;
esac
Nginx
技术分享
#!/usr/bin/env python
#coding:utf-8
import sys
import os
import traceback

import thrift
from thrift import protocol, transport
from thrift.transport import TTransport
from thrift.protocol import TBinaryProtocol
from fb303 import *
from fb303.ttypes import *
from scribeadmin.sina_fb303_mgmt import fb303_wrapper

# Metric descriptor list; metric_init() rebuilds and returns it.
descriptors = []
# Thrift transport and protocol factories that metric_handler() hands to
# fb303_wrapper() on every call.
trans_factory = TTransport.TFramedTransportFactory()
prot_factory = TBinaryProtocol.TBinaryProtocolFactory()

def metric_handler(name):
    """Return the scribe counter value for the metric called *name*.

    The second "_"-separated token of *name* selects the counter
    (e.g. "scribe_sent" -> "sent"). Returns 0 when the counters cannot be
    fetched or the selected counter is not present.
    """
    host = '127.0.0.1'
    port = 1463
    counter_by_kind = {
        "received": "received good",
        "sent": "sent",
        "retries": "retries",
        "reconnections": "number of reconnections",
    }
    tmp = name.split("_")
    # A name without "_" or with an unknown kind maps to '' and therefore
    # yields 0 through the KeyError branch below.
    metric = counter_by_kind.get(tmp[1], '') if len(tmp) > 1 else ''
    try:
        # NOTE(review): 'counters' is restored as a string literal (the quotes
        # were lost in the published listing) -- confirm against fb303_wrapper.
        retdict = fb303_wrapper('counters', host, port, trans_factory, prot_factory)
        return retdict[metric]
    except (IOError, KeyError):
        # The original "except IOError,KeyError" only caught IOError (and bound
        # it to the name KeyError); both are meant to fall back to 0.
        return 0

def metric_init(params):
    """Build the metric descriptors, store them in ``descriptors`` and return them.

    *params* is accepted for interface compatibility and is not used.
    """
    global descriptors

    def describe(name, slope, description):
        # All scribe metrics share everything except name, slope and description.
        return {'name': name,
                'call_back': metric_handler,
                'time_max': 90,
                'value_type': 'uint',
                'units': '',
                'slope': slope,
                'format': '%u',
                'description': description,
                'groups': 'scribe'}

    descriptors = [
        describe('scribe_received_good', 'positive', 'Received good log entrys'),
        describe('scribe_sent', 'positive', 'HDFS bytes written'),
        describe('scribe_retries', 'both', 'retries for queue size'),
        describe('scribe_reconnections', 'both', 'number of reconnections'),
    ]
    return descriptors

def metric_cleanup():
    """Clean up the metric module (nothing to release)."""
    pass

#This code is for debugging and unit testing
if __name__ == '__main__':
    metric_init({})
    for d in descriptors:
        # Call each metric's handler once and show the value it reports.
        v = d['call_back'](d['name'])
        print('value for %s is %u' % (d['name'], v))
suda web scribe
技术分享
 1 #!/usr/bin/env python
 2 #coding:utf-8
 3 import sys
 4 import os
 5 import traceback
 6 
 7 import thrift
 8 from thrift import protocol, transport
 9 from thrift.transport import TTransport
10 from thrift.protocol import TBinaryProtocol
11 from fb303 import *
12 from fb303.ttypes import *
13 from scribeadmin.sina_fb303_mgmt import fb303_wrapper
14 
15 descriptors = []
16 trans_factory = TTransport.TFramedTransportFactory()
17 prot_factory = TBinaryProtocol.TBinaryProtocolFactory()
18 
19 def metric_handler(name):
20     global trans_factory, prot_factory
21     host = 127.0.0.1
22     port = 1463
23     tmp = name.split("_")
24     metric = ‘‘
25     if tmp[1] == "received":
26         metric = "received good"
27     elif tmp[1] == "sent":
28         metric = "sent"
29     elif tmp[1] == "denied":
30         metric = "denied for queue size"
31     else:
32         pass
33     try:
34         retdict = fb303_wrapper(counters, host, port, trans_factory,
35                 prot_factory)
36         return retdict[metric]
37     except IOError:
38         return 0
39 
40 def metric_init(params):
41     global descriptors
42     d1 = {name: scribe_received_good,
43             call_back: metric_handler,
44             time_max: 90,
45             value_type: uint,
46             units: ‘‘,
47             slope: positive,
48             format: %u,
49             description: Received good log entrys,
50             groups: scribe}
51     d2 = {name: scribe_sent,
52             call_back: metric_handler,
53             time_max: 90,
54             value_type: uint,
55             units:‘‘,
56             slope:positive,
57             format:%u,
58             description:HDFS bytes written,
59             groups:scribe}
60     d3 = {name: scribe_denied_for_queue_size,
61             call_back: metric_handler,
62             time_max: 90,
63             value_type: uint,
64             units:‘‘,
65             slope:both,
66             format:%u,
67             description:Denied for queue size,
68             groups:scribe}
69     descriptors = [d1,d2,d3]
70     return descriptors
71 
72 def metric_cleanup():
73     ‘‘‘Clean up the metric module.‘‘‘
74     pass
75 
76 #This code is for debugging and unit testing
77 if __name__ == __main__:
78     metric_init({})
79     for d in descriptors:
80         v = d[call_back](d[name])
81         print value for %s is %u % (d[name], v)
scribe relay

 

日常记录

技术分享
#!/usr/bin/python2.6
#-*- encoding:utf-8 -*-

from __future__ import division
import datetime,MySQLdb
import smtplib    
from email.MIMEText import MIMEText 

# Mail settings for the daily log-size report.
sender = 'datamonitor@staff.sina.com.cn'
receiver = ['huming@staff.sina.com.cn', 'yantao1@staff.sina.com.cn', 'yongsan@staff.sina.com.cn', 'zhaobing@staff.sina.com.cn', 'data_qa@staff.sina.com.cn']
subject = 'Rsync log size 2.0'
smtpserver = 'mail.staff.sina.com.cn'
username = 'datamonitor@staff.sina.com.cn'
# NOTE(review): SMTP password stored in clear text in the source; move it to
# configuration outside the script.
password = '1234.com'

def getValue(size):
    """Format a byte count as a short string with a G/M/K/B suffix.

    Zero or negative sizes are reported as '目录为空' ("directory is empty").
    Sizes that fall exactly on a unit boundary (1024, 1048576, 1073741824)
    are formatted in the larger unit; previously they fell through every
    strict comparison and were reported as an empty directory.
    """
    # Float divisors keep the result correct with or without
    # "from __future__ import division".
    if size >= 1073741824:
        return '%2.2fG' % (size / 1073741824.0)
    if size >= 1048576:
        return '%2.2fM' % (size / 1048576.0)
    if size >= 1024:
        return '%2.2fK' % (size / 1024.0)
    if size > 0:
        return '%2.0fB' % (size)
    return '目录为空'

def getPercent(value1,value2):
    """Return the change of value1 relative to value2 as a percentage string.

    The result is formatted with two decimals; when value2 is 0 the integer 0
    is returned instead. Relies on the module's
    "from __future__ import division" for true division.
    """
    if value2 != 0:
        change = (value1 - value2) / value2 * 100
        return "%0.2f" % change
    return 0

def main():
    """Build the daily log-size HTML report from MySQL and mail it.

    The report has three tables: a one-day overview compared with the day
    before and the same day a week earlier, the ten largest logs of
    yesterday, and the per-log details of yesterday.
    """
    try:
        conn = MySQLdb.connect(host='10.39.2.120',user='monitor',passwd='123qwe',db='monitor_v2',port=3306)
    except MySQLdb.Error:
        print("Could not connect to MySQL server.")
        # A failed connection is an error, so exit non-zero.
        raise SystemExit(1)

    # NOTE(review): the quotes around 'local' and 'fz' were lost in the
    # published listing and are restored here; "w.`status` = 0" is kept as
    # published -- confirm whether the column holds the string '0'.
    # Per-log totals of one day, largest first.
    sql = '''SELECT l.log_name,lc.check_time_type,lc.filepath,SUM(lc.filesize) AS sum  FROM log_size_check_record AS lc JOIN workorder_info AS w ON lc.log_id = w.log_id JOIN log_info_list AS l ON lc.log_id = l.log_id WHERE w.`status` = 0 AND DATE(lc.dt) = %s AND l.log_type = 'local' GROUP BY log_name ORDER BY sum DESC '''
    # Ids of the ten largest logs of one day.
    sql2 = '''SELECT l.log_id,l.log_name,lc.check_time_type,l.receive_module,SUM(lc.filesize) AS sum  FROM log_size_check_record AS lc JOIN workorder_info AS w ON lc.log_id = w.log_id JOIN log_info_list AS l ON lc.log_id = l.log_id WHERE w.`status` = 0 AND DATE(lc.dt) = %s AND l.log_type = 'local' GROUP BY log_name ORDER BY sum DESC LIMIT 10'''
    # One log's size on up to three given days, newest first.
    sql3 = '''SELECT l.log_name,l.receive_module,u.user_name,check_time_type,SUM(lc.filesize) AS sum FROM log_info_list AS l JOIN log_size_check_record AS lc ON l.log_id = lc.log_id JOIN log_user_relation lu ON l.log_id = lu.log_id AND lu.type = 'fz' AND lu.level = 1 JOIN all_user_info u ON lu.user_id = u.user_id WHERE l.log_id = %s and DATE(lc.dt) IN (%s,%s,%s) GROUP BY DATE(lc.dt) DESC'''

    try:
        cur = conn.cursor()
        cur.execute('SET autocommit=1')
        cur.execute("SET NAMES 'utf8'")
        today = datetime.date.today()
        dt = (today + datetime.timedelta(-1)).strftime('%Y-%m-%d')

        context = ""
        context += "<B>一天概览</B><BR><BR>"
        context += "<table border=1><tr><td>日期</td><td>日志大小</td><td>日环比</td><td>波动(%)</td><td>周同比</td><td>波动(%)</td></tr>"

        # Total size for yesterday, the day before and the same day last week.
        dtl = []
        size = []
        for days_back in [1,2,7]:
            day = (today + datetime.timedelta(-days_back)).strftime('%Y-%m-%d')
            dtl.append(day)
            cur.execute(sql, (day,))
            total = 0
            for p in cur.fetchall():
                total += int(p[-1])
            size.append(total)
        context += "<tr><td>%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td><td>%s</td></tr>" % (dtl[0],getValue(size[0]),getValue(size[1]),getPercent(size[0],size[1]),getValue(size[2]),getPercent(size[0],size[2]))
        context += "</table>"

        context += "<BR><B>前一天日志大小前十位</B><BR><BR>"
        context += "<table border=1><tr><td>日志名称</td><td>模块名称</td><td>日志联系人</td><td>入库类型</td><td>日志大小</td><td>环比</td><td>波动(%)</td><td>周同比</td><td>波动(%)</td></tr>"

        cur.execute(sql2, (dt,))
        log_ids = [p[0] for p in cur.fetchall()]

        for log_id in log_ids:
            cur.execute(sql3, (log_id,dtl[0],dtl[1],dtl[2]))
            t = cur.fetchall()
            if not t:
                # No row for this log: skip it instead of reusing the previous
                # log's cells (or failing on the first iteration).
                continue
            # NOTE(review): rows are paired with days by position only; if the
            # middle day has no data, the week-old row lands in the
            # day-over-day columns.
            b = list(t[0])
            if len(t) >= 2:
                b.append(t[1][-1])
                b.append(getPercent(t[0][-1],t[1][-1]))
            else:
                b += [0,0]
            if len(t) >= 3:
                b.append(t[2][-1])
                b.append(getPercent(t[0][-1],t[2][-1]))
            else:
                b += [0,0]
            context += "<tr>"
            for j, cell in enumerate(b):
                # Columns 4, 5 and 7 hold byte counts.
                if j == 4 or j == 5 or j == 7:
                    cell = getValue(int(cell))
                context += "<td>%s</td>" % cell
            context += "</tr>"
        context += "</table>"

        cur.execute(sql, (dt,))
        context += "<BR><B>日志详情</B><BR><BR>"
        context += "<table border=1><tr><td>日志名称</td><td>入库类型</td><td>日志路径</td><td>日志大小</td></tr>"
        for p in cur.fetchall():
            context += "<tr>"
            for j, cell in enumerate(p):
                if j == 3:
                    context += "<td>%s</td>" % getValue(int(cell))
                else:
                    context += "<td>%s</td>" % cell
            context += "</tr>"
        context += "</table>"
        msg = MIMEText(context,'html','utf-8')

        msg['Subject'] = subject
        msg['To'] = ",".join(receiver)
        msg['From'] = sender

        try:
            smtp = smtplib.SMTP()
            smtp.connect(smtpserver)
            smtp.login(username, password)
            smtp.sendmail(sender, receiver, msg.as_string())
            smtp.quit()
        except Exception as e:
            # Mail delivery is best effort; report the failure and carry on.
            print(str(e))

    except MySQLdb.Error as e:
        print("Mysql Error %d: %s" %(e.args[0],e.args[1]))
    finally:
        conn.close()

if __name__ == '__main__':
    main()
checkmfs_v2.py
技术分享
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Author:zifeng

from __future__ import division
import os
import sys
import smtplib    
from email.MIMEText import MIMEText 

# Mail settings for the MFS usage report.
sender = 'datamonitor@staff.sina.com.cn'
receiver = ['huming@staff.sina.com.cn', 'yongsan@staff.sina.com.cn', 'yantao1@staff.sina.com.cn', 'zhaobing@staff.sina.com.cn']
cc = ['data_qa@staff.sina.com.cn']
subject = 'MFS空间使用情况'
smtpserver = 'mail.staff.sina.com.cn'
username = 'datamonitor@staff.sina.com.cn'
# NOTE(review): SMTP password stored in clear text in the source; move it to
# configuration outside the script.
password = '1234.com'

def getDir(dir, dep):
    """Return the directories located exactly *dep* levels below *dir*.

    Runs ``find <dir> -maxdepth <dep> -mindepth <dep> -type d`` and returns
    its output lines, stripped. The command is started without a shell so
    paths containing spaces or shell metacharacters are passed through intact.
    """
    import subprocess
    depth = str(dep)
    proc = subprocess.Popen(
        ['find', dir, '-maxdepth', depth, '-mindepth', depth, '-type', 'd'],
        stdout=subprocess.PIPE, universal_newlines=True)
    output = proc.communicate()[0]
    return [x.strip() for x in output.splitlines()]

def getValue(size):
    """Format a byte count as a short string with a T/G/M/K/B suffix.

    Zero or negative sizes are reported as '目录为空' ("directory is empty").
    Sizes that fall exactly on a unit boundary (1024, 1048576, ...) are
    formatted in the larger unit; previously they fell through every strict
    comparison and were reported as an empty directory.
    """
    # Float divisors keep the result correct with or without
    # "from __future__ import division".
    if size >= 1099511627776:
        return '%2.2fT' % (size / 1099511627776.0)
    if size >= 1073741824:
        return '%2.2fG' % (size / 1073741824.0)
    if size >= 1048576:
        return '%2.2fM' % (size / 1048576.0)
    if size >= 1024:
        return '%2.2fK' % (size / 1024.0)
    if size > 0:
        return '%2.0fB' % (size)
    return '目录为空'

def dirInfo(dir1):
    """Run mfsdirinfo on *dir1* and return its values followed by the path.

    Every output line that does not start with '/' contributes its second
    whitespace-separated token; *dir1* is appended as the last element.
    Returns None (after printing a hint) when the mfsdirinfo binary is
    missing.
    """
    import subprocess
    cmd = '/usr/local/mfs/bin/mfsdirinfo'
    if not os.path.exists(cmd):
        print('Please check mfs conmand was install')
        return None
    # No shell involved, so directory names with spaces are passed intact.
    proc = subprocess.Popen([cmd, dir1], stdout=subprocess.PIPE,
                            universal_newlines=True)
    a = []
    for line in proc.communicate()[0].splitlines():
        if line.startswith('/'):
            continue
        fields = line.strip().split()
        # Blank or single-token lines carry no value; skip them instead of
        # failing with IndexError.
        if len(fields) > 1:
            a.append(fields[1])
    a.append(dir1)
    return a

def main():
    """Mail an HTML report of MFS directory usage.

    Usage: <script> <directory> <depth>. For every level from 1 to <depth>
    the directories at that level are listed, largest first by the value at
    index 6 of dirInfo()'s result (shown in the "realsize" column).
    """
    if len(sys.argv) != 3:
        print('Please use %s + Dirname + depth' % sys.argv[0])
        sys.exit(1)
    if not os.path.isdir(sys.argv[1]):
        print('Error,The input path is not exists')
        sys.exit(2)
    try:
        max_depth = int(sys.argv[2])
    except ValueError:
        # A non-numeric depth is a usage error, not a traceback.
        print('Please use %s + Dirname + depth' % sys.argv[0])
        sys.exit(1)

    context = ''
    context += "<B>MFS目录详细信息</B><BR><BR>"
    context += "<tr>将统计%s的%s级目录数据</tr><BR>" % (sys.argv[1], sys.argv[2])
    for i in range(1, max_depth + 1):
        print('In depth %s' % i)
        context += "<BR><B>" + 'In depth ' + str(i) + "</B><BR><BR>"
        context += "<table border=1><tr><td>inode</td><td>directories</td><td>files</td><td>chunks</td><td>length</td><td>size</td><td>realsize</td><td>路径</td></tr>"
        rows = []
        for path in getDir(sys.argv[1], i):
            info = dirInfo(path)
            # dirInfo() returns None when mfsdirinfo is missing; rows without
            # the value at index 6 cannot be sorted, so both are skipped.
            if info is not None and len(info) > 6:
                rows.append(info)
        rows = sorted(rows, key=lambda x: int(x[6]), reverse=True)
        for d in rows:
            context += "<tr>"
            for j, cell in enumerate(d):
                if j == 5 or j == 6:
                    context += "<td>%s</td>" % getValue(int(cell))
                else:
                    context += "<td>%s</td>" % cell
            context += "</tr>"
        context += "</table>"

    msg = MIMEText(context,'html','utf-8')
    msg['Subject'] = subject
    msg['To'] = ",".join(receiver)
    msg['Cc'] = ",".join(cc)
    msg['From'] = sender
    try:
        smtp = smtplib.SMTP()
        smtp.connect(smtpserver)
        smtp.login(username, password)
        # The Cc header alone does not deliver anything; the Cc addresses
        # must be part of the envelope recipients as well.
        smtp.sendmail(sender, receiver + cc, msg.as_string())
        smtp.quit()
    except Exception as e:
        # Mail delivery is best effort; report the failure.
        print(str(e))

if __name__ == "__main__":
    main()
dirinfo.py
技术分享
#!/usr/bin/env python
#-*- coding: utf-8 -*-

import os
import re
import datetime,MySQLdb

def replaceDate(str,dt):
    """Expand the date placeholders inside [...] sections of a path template.

    *dt* is a digit string laid out as YYYYMMDD[HH[MI]]; missing trailing
    parts expand to ''. Each bracketed section is upper-cased, then MI, HH,
    YYYY, YY, MM and DD are replaced by the matching slice of *dt*. Finally
    every '[' and ']' is removed from the result.
    """
    # Order matters: YYYY must be substituted before YY.
    fields = (
        ('MI', dt[10:12]),
        ('HH', dt[8:10]),
        ('YYYY', dt[0:4]),
        ('YY', dt[2:4]),
        ('MM', dt[4:6]),
        ('DD', dt[6:8]),
    )
    for token in re.findall(r'\[.*?\]', str):
        expanded = token.upper()
        for placeholder, value in fields:
            expanded = expanded.replace(placeholder, value)
        str = str.replace(token, expanded)
    return str.replace('[', '').replace(']', '')

def main():
    """Pre-create the rsync target directories for the date four days ahead.

    Reads the path templates of active rsync logs from MySQL, expands them
    with replaceDate() and creates every directory that does not exist yet.
    """
    try:
        conn = MySQLdb.connect(host='10.39.2.120',user='monitor',passwd='123qwe',db='monitor_v2',port=3306)
    except MySQLdb.Error:
        print("Could not connect to MySQL server.")
        # A failed connection is an error, so exit non-zero.
        raise SystemExit(1)

    try:
        cur = conn.cursor()
        date = datetime.date.today() + datetime.timedelta(4)
        dt = date.strftime('%Y%m%d')
        sql = """SELECT l.fullpath FROM log_info_list l JOIN workorder_info w ON l.log_id = w.log_id WHERE w.`status` = '0' AND l.push_type = 'rsync'"""
        cur.execute(sql)
        for p in cur.fetchall():
            destDir = replaceDate(p[0],dt)
            if not os.path.exists(destDir):
                os.makedirs(destDir)
            else:
                print('%s is exists' % destDir)
    except MySQLdb.Error as e:
        print("Mysql Error %d: %s" %(e.args[0],e.args[1]))
    finally:
        # Close the connection on the error path too.
        conn.close()

if __name__ == "__main__":
    main()
mkdir_for_nextmonth.py
技术分享
#!/usr/bin/env python
# coding: utf-8
import os
import time
import sys
import MySQLdb
import time
import re
import datetime
sys.path.append("/usr/local/jobclient/bin/python/lib")
import tools


try:
    # Connect to MySQL and read the (source path, HDFS path) pairs of active logs.
    conn = MySQLdb.connect(host='10.39.2.120',user='monitor',passwd='123qwe',db='monitor_v2',port=3306)
    cur  = conn.cursor()
    sql = "select org_path,hdfs_path from log_put_conf as l JOIN workorder_info w on l.log_id = w.log_id WHERE w.`status` = '0'"
    cur.execute(sql)
    result = cur.fetchall()

    # Date strings and the project helper object.
    tool = tools.tools()
    today = datetime.date.today()
    # recent_days[0] is yesterday ... recent_days[6] is seven days ago.
    recent_days = [(today - datetime.timedelta(days=n)).strftime('%Y%m%d') for n in range(1, 8)]
    dt_1 = recent_days[0]
    dt_7 = recent_days[6]

    # NOTE(review): mon_new is never used below; kept because tool.execu()
    # is an opaque project helper.
    mon = "date +%Y%m -d '+1 month ago'"
    mon_new = tool.execu(mon)
    ye = "date +%Y"
    ye_new = tool.execu(ye)

    # Remove the week-old result list and the previous working list.
    if os.path.exists('/tmp/list.conf_%s' % dt_7):
        os.remove('/tmp/list.conf_%s' % dt_7)
    else:
        print('list_conf_%s file no exits' % dt_7)

    if os.path.exists('/tmp/list.conf'):
        os.remove('/tmp/list.conf')
    else:
        print('list_conf file no exits')

    # Walk the path pairs read from the table.
    for line in result:
        a0 = line[0]
        b1 = tool.replaceDate(line[1],dt_1)
        # Size of yesterday's HDFS directory.
        h_size = "su - data_qa -s /bin/bash -c \"hadoop fs -dus %s\" | awk '{print $2}'" % b1
        h_size = tool.execu(h_size)
        # Exit status 0 means the HDFS directory exists.
        h_dir = "su - data_qa -s /bin/bash -c \"hadoop fs -test -d %s\"" % b1
        h_dir = os.system(h_dir)

        # NOTE(review): the type returned by tool.execu() is not visible here;
        # if it is a string, "h_size > 0" does not compare numerically --
        # confirm and convert with int() if so.
        if h_dir == 0 and h_size > 0:

            if "YY" in a0:
                ma = re.findall(r'\[.+\].*', a0)
                if not ma:
                    # "YY" without a [...] date section: nothing to expand.
                    continue
                new_b0 = a0.replace(ma[0], '')

                # Dated directories of the last seven days (and the current
                # year, as returned by "date +%Y") are excluded from the list.
                keep = [tool.replaceDate(ma[0], day) for day in recent_days]
                keep.append(ye_new)
                os.system('find %s -mtime +7 -type d | grep \'[0-9]$\' | egrep -v \'%s$\' >> /tmp/list.conf' % (new_b0, '|'.join(keep)))
            else:
                os.system('find %s -type f -mtime +7 >> /tmp/list.conf' % (a0))
    # Drop duplicate lines.
    os.system('sort -u /tmp/list.conf | sort -rn >> /tmp/list.conf_%s' % (dt_7))

    cur.close()
    conn.close()
except MySQLdb.Error as e:
    # Include the error itself; the original printed only the label.
    print("MySQL Error Msg: %s" % e)
# Delete the listed directories.
os.system('/bin/sh /usr/home/liguo/test/del_mail.sh')
mfs_clean.py
技术分享
#!/usr/bin/env python
# -*- encoding:utf-8 -*-

import os
import sys
import time
import socket
import MySQLdb
#os.environ["TEMP"]

# Alert metadata passed to sendsms()/sendmail().
service = '监控系统'
level = 'CRITICAL'
# Timestamp taken once when the module is loaded and reused in every message.
current_time = (time.strftime('%Y-%m-%d %H:%M:%S'))

# Look up the kafkaproxy hosts listed under /kafkaProxy8 in ZooKeeper and
# return their IP addresses.
def getAliveHosts():
    """Return the IPs of the hosts listed under /kafkaProxy8, or None on error.

    The first zkCli output line starting with '[' is split on whitespace;
    the part before ':' of each entry is resolved with
    socket.gethostbyname(). When nothing is listed the result is [].
    """
    cmd = r'''/bin/sh /usr/local/zookeeper-3.4.3/bin/zkCli.sh -server 10.39.1.66:22181 ls /kafkaProxy8 2> /dev/null | grep -P "^\["'''
    try:
        names = [x.split(":")[0] for x in os.popen(cmd).readline().strip().strip('[]').split()]
        return [socket.gethostbyname(name) for name in names]
    except Exception:
        # Best effort: report the failure (e.g. a name that does not resolve)
        # and let the caller decide what to do with None.
        print('get error')
        return None

def Get_Dict(sql):
    """Run *sql* against the monitor_v2 database and return all rows.

    Opens a new connection for the call and always closes it, including when
    the query raises.
    """
    conn = MySQLdb.connect(host='10.39.2.120',user='monitor',passwd='123qwe',db='monitor_v2',port=3306)
    try:
        cur = conn.cursor()
        cur.execute('SET autocommit=1')
        cur.execute("SET NAMES 'utf8'")
        cur.execute(sql)
        data = cur.fetchall()
        cur.close()
        return data
    finally:
        conn.close()

def getLocalIp():
    """Return the "inet addr" of eth1 as printed by ifconfig ('' if none).

    Returns None after printing a message if the command cannot be run.
    """
    cmd = """/sbin/ifconfig eth1 | awk -F[" ":]+ '/inet addr/{print $4}'"""
    try:
        return os.popen(cmd).readline().strip()
    except Exception:
        print('get ip error')
        return None

def getReceiver():
    """Return the comma-separated alert receivers, plus the duty user if known.

    The duty user is fetched with curl from the monitor service; if that
    yields nothing, only the fixed receiver list is returned.
    """
    receiver = 'yongsan,jiangyu2,hongtao4,zhichao1,yantao1,zhaobing'
    cmd = "/usr/bin/curl http://monitor.pso.sina.com.cn/monitor/index.php/interface/internal/getDutyUser"
    try:
        # strip(): a trailing newline in the response would otherwise end up
        # inside the receiver list.
        duty_user = os.popen(cmd).readline().strip()
        if duty_user:
            receiver = receiver + ',' + duty_user
    except Exception:
        print("get user error")
    return receiver

def sendsms(recs, service, level, subject):
    """POST an SMS alert to the monitor service with curl.

    The fields are handed to curl as separate arguments (no shell), so
    quotes, spaces or newlines in *subject* cannot break the command.
    """
    import subprocess
    sms = 'http://monitor.pso.sina.com.cn/monitor/index.php/interface/sendSMS'
    subprocess.call(['/usr/bin/curl',
                     '-d', 'receivers=%s' % recs,
                     '-d', 'service=%s' % service,
                     '-d', 'level=%s' % level,
                     '-d', 'subject=%s' % subject,
                     sms])

def sendmail(recs, service, level, subject, con):
    """POST a mail alert with body *con* to the monitor service with curl.

    The fields are handed to curl as separate arguments (no shell), so the
    HTML in *con* cannot break the command.
    """
    import subprocess
    mail = 'http://monitor.pso.sina.com.cn/monitor/index.php/interface/sendMail'
    subprocess.call(['/usr/bin/curl',
                     '-d', 'receivers=%s' % recs,
                     '-d', 'service=%s' % service,
                     '-d', 'level=%s' % level,
                     '-d', 'subject=%s' % subject,
                     '-d', 'content=%s' % con,
                     mail])

# Query the machines that are being monitored: enabled monitors whose server
# descriptor is '新kafka集群'.
sql =  '''SELECT s.server_ip FROM scribe_server AS s,scribe_monitor As m where s.server_id=m.server_id and s.descriptor='新kafka集群' AND m.enabled=1 '''

simple_data = Get_Dict(sql)
# Keep only the part of server_ip before ':' (drops any suffix after it).
AliveHost = [row[0].split(':')[0] for row in simple_data]
# Number of hosts that main() requires to be registered in ZooKeeper.
ac = len(AliveHost)

message=''

def main():
    """Alert by SMS and mail when fewer hosts than expected are registered in zk.

    Runs a single check: the original "while True" loop always broke out
    after its first pass, so the loop and its pass counter were dead code.
    """
    test = getReceiver()
    line="http://general.wiki.erp.sina.com.cn/Monitor_FAQ/zk_counters"
    try:
        hostList = getAliveHosts()
        if hostList is None:
            # The lookup failed and already printed its error; there is
            # nothing reliable to compare against.
            return
        if len(hostList) >= ac:
            print("All server is ok")
            return

        # Collect the expected hosts that are missing from ZooKeeper.
        ip=''
        for i in AliveHost:
            if i not in hostList:
                print(i)
                ip += i + ' '
        message = '<tr><td>[%s] 主机:%s(kafkaproxy)未连接到zk,请注意查看!\n<BR> 处理方法<a href=%s>见这里</a>\n<td/><tr/><br>' % (current_time,ip,line)
        sms_mesg = '[%s] kafkaproxy Host:%s 未连接到zk,请注意查看\n' % (current_time,ip)
        print("sms+email")
        sendsms(test, service, level, sms_mesg)
        sendmail(test, service, level, "kafkaproxy连接zookeeper报警", message)
    except KeyboardInterrupt:
        # Ctrl-C ends the check quietly, as before.
        pass

if __name__ == "__main__":
    main()
kafkaproxy.py
技术分享
#!/bin/bash
# Monitor the scribe back-end servers registered on the zk node: compare the
# enabled receivers recorded in MySQL with the hosts listed under
# /ScribeCompactNg and alert when some of them are missing.
source ~/.bash_profile

sms=http://monitor.pso.sina.com.cn/monitor/index.php/interface/sendSMS
mail=http://monitor.pso.sina.com.cn/monitor/index.php/interface/sendMail
receiver=yongsan,jiangyu2,hongtao4,zhichao1,yantao1,zhaobing
duty_user=`curl http://monitor.pso.sina.com.cn/monitor/index.php/interface/internal/getDutyUser`
# Quoted test: the unquoted form fails with a syntax error when the value is empty.
if [ -n "$duty_user" ];then
   receiver="$receiver,$duty_user"
fi
dir="/usr/home/data_qa/shell"
receive="$receiver"
# Every run appends one line to the pid file; the line count selects the
# alert channel below.
pid="$dir/pid"
/bin/echo $$ >> "$pid"
service=监控系统
level=CRITICAL
message=''
local_ip=$(/sbin/ifconfig eth1 | awk -F[" ":]+ '/inet addr/{print $4}')

# Expected receivers from MySQL. -N suppresses the column header, so every
# output word is one server_ip entry.
# NOTE(review): the password is passed on the command line in clear text.
alive_ip=""
alive_num=0
MYSQL=`/usr/bin/mysql -N -umonitor -h 10.39.2.120 -p123qwe --database monitor_v2 -e "SELECT s.server_ip FROM scribe_server AS s,scribe_monitor As m where s.server_id=m.server_id and s.descriptor='hadoop2.0接收机' AND m.enabled=1;"`
for i in $MYSQL;do
    # Keep the part before the first ':'.
    alive_ip+="${i%%:*} "
    alive_num=$(expr $alive_num + 1)
done
echo "alive_num:" $alive_num

# Hosts currently listed in ZooKeeper, one "host:port" entry per line.
Ng_line=`echo "ls /ScribeCompactNg" > script &&  /usr/local/zookeeper-3.4.3/bin/zkCli.sh -server 10.39.1.66 <script 2>/dev/null |grep yz |grep -v CONNECTED |sed 's/\[//g;s/\]//g' |awk -F, '{for(i=1;i<=NF;i++) print $i}'`
Sc_Ng=""
Ng_num=0
for s in $Ng_line;do
    Ng_ip=`echo $s|awk -F: '{print $1}'`
    # Take the address from the third field of ping's "PING ..." line.
    Sc_Ng+="$(/bin/ping -c 1 $Ng_ip |grep PING |awk '{print $3}'|sed 's/[()]//g') "
    Ng_num=$(expr $Ng_num + 1)
done
echo "Ng_num:" $Ng_num

if [ "$alive_num" -ne "$Ng_num" ] ; then
    ip2=''
    for ap in $alive_ip;do
        # -w -F: whole-word, literal match, so 10.39.5.22 no longer matches
        # inside 10.39.5.228 and '.' is not treated as a wildcard.
        if ! echo "$Sc_Ng" | grep -qwF -- "$ap"; then
            ip2="${ip2:+$ip2,}$ap"
        fi
    done
    if [ -n "$ip2" ]; then
        message="[$(date +'%F %T')] 报警内容:正常连接到zk个数为: ${Ng_num}个,未连接到zk的ip为$ip2 <BR> 处理方法<a href=http://general.wiki.erp.sina.com.cn/Monitor_FAQ/zk_counters>见这里</a>"
        message1="[`date '+%F %T'`]scribe2.0接收集群ip:$ip2 未连接到zk,请注意查看!"
    fi
else
    # Counts match: reset the run counter and stop.
    test -f "$pid" && /bin/rm -f "$pid" > /dev/null
    exit 0
fi
if [ "$message" != "" ]; then
  if [ -e "$pid" ];then
       runs=$(wc -l < "$pid")
       # Mail goes out on every failing run; SMS only on the first one.
       curl -d receivers="$receiver" -d service=$service -d level=$level -d subject="zookeeper连接报警" -d content="$message" $mail
       case "$runs" in
       1)
               curl -d receivers="$receiver" -d service=$service -d level=$level -d subject="$message1" $sms
               ;;
       2)
               ;;
       *)
               # Third (or later) failing run: reset the counter so the next
               # failure sends an SMS again.
               /bin/rm -f "$pid" > /dev/null
               ;;
       esac
  else
    echo "pid is not exist,exit!"
  fi
fi
zk_scribe_Ng.sh

 

监控脚本

标签:

原文地址:http://www.cnblogs.com/liyongsan/p/5571848.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!