欢迎您访问程序员文章站,本站旨在为大家提供、分享程序员计算机编程知识!
您现在的位置是: 首页

阿里云监控服务器信息 博客分类: linux python linux

程序员文章站 2024-02-24 17:01:22
...
#!/usr/bin/python
#########################################
# Function:    sample linux performance indices
# Usage:       python sampler.py
# Author:      CMS DEV TEAM
# Company:     Aliyun Inc.
# Version:     1.1
#########################################

import os
import os.path
import sys
import time
import operator
import httplib
import logging
import socket
import random
from shutil import copyfile
from subprocess import Popen, PIPE
from logging.handlers import RotatingFileHandler

# Module-level state. Every name starts as None and is assigned in the
# ``__main__`` section at the bottom of the script before collector() runs.
logger = None  # logger used by disk_io_Kbps() and collector()
REMOTE_HOST = None  # host that collector() opens its HTTP connection to
REMOTE_PORT = None  # port that collector() opens its HTTP connection to
REMOTE_MONITOR_URI = None  # request path that collector() POSTs the metric lines to
UUID = None  # when set, collector() adds an instanceId=<UUID> tag to every metric line

def get_mem_usage_percent():
    """Return physical-memory and swap utilisation read from /proc/meminfo.

    Physical usage treats everything except MemFree, Buffers and Cached as
    used memory.

    Returns:
        A ``(physical_percent, virtual_percent)`` tuple of floats, or None
        when /proc/meminfo cannot be read, holds a non-numeric value, or
        lacks the MemTotal or MemFree entries. ``virtual_percent`` is 0
        when SwapTotal is 0 or absent.

    Raises:
        Exception: propagated from usage_percent() when MemTotal is 0.
    """
    wanted = ('MemTotal:', 'MemFree:', 'Buffers:', 'Cached:',
              'SwapTotal:', 'SwapFree:')
    values = {}
    try:
        f = open('/proc/meminfo', 'r')
        try:
            for line in f:
                fields = line.split()
                if len(fields) >= 2 and fields[0] in wanted:
                    values[fields[0]] = int(fields[1])
        finally:
            # Close the handle even when a value fails to parse.
            f.close()
        mem_total = values['MemTotal:']
        mem_free = values['MemFree:']
    except (IOError, OSError, ValueError, KeyError):
        return None

    # Entries other than MemTotal/MemFree are optional: a missing one counts
    # as 0 instead of aborting the whole sample with an unbound local.
    mem_buffer = values.get('Buffers:', 0)
    mem_cache = values.get('Cached:', 0)
    vmem_total = values.get('SwapTotal:', 0)
    vmem_free = values.get('SwapFree:', 0)

    physical_percent = usage_percent(mem_total - (mem_free + mem_buffer + mem_cache), mem_total)
    virtual_percent = 0
    if vmem_total > 0:
        virtual_percent = usage_percent((vmem_total - vmem_free), vmem_total)
    return physical_percent, virtual_percent

# Filesystem types that get_disk_partition() ignores even though
# /proc/filesystems lists them without the "nodev" prefix.
black_list = ('iso9660',)

def usage_percent(use, total):
    """Return ``use`` as a percentage of ``total``.

    The result is a float computed as ``float(use) / total * 100``.

    Raises:
        Exception: with the message "ERROR - zero division error" when the
            division fails because ``total`` is zero.
    """
    try:
        return float(use) / total * 100
    except ZeroDivisionError:
        raise Exception("ERROR - zero division error")

def get_disk_partition():
    """Return the mount points of disk-backed filesystems.

    A filesystem type counts as disk-backed when /proc/filesystems lists it
    without the "nodev" prefix and it is not named in ``black_list``. Mount
    points come from the second column of /etc/mtab; entries whose line
    starts with "none" are ignored.

    Returns:
        A list of mount-point strings in mtab order without duplicates, or
        None when either file cannot be read.
    """
    try:
        fs_types = []
        f = open("/proc/filesystems", "r")
        try:
            for line in f:
                if line.startswith("nodev"):
                    continue
                fs_type = line.strip()
                if fs_type and fs_type not in black_list:
                    fs_types.append(fs_type)
        finally:
            f.close()

        mount_points = []
        f = open('/etc/mtab', "r")
        try:
            for line in f:
                if line.startswith('none'):
                    continue
                fields = line.split()
                # Skip blank or truncated lines instead of discarding every
                # partition because one entry is malformed.
                if len(fields) < 3:
                    continue
                if fields[2] in fs_types and fields[1] not in mount_points:
                    mount_points.append(fields[1])
        finally:
            f.close()
    except (IOError, OSError):
        return None
    return mount_points

def check_disk():
    """Return usage figures for every mount point from get_disk_partition().

    Returns:
        A dict mapping mount point to a ``(used_percent, size)`` pair of
        strings, both formatted with two decimals. ``size`` is the
        filesystem size in bytes divided by ``1024 * 1000000``. Returns None
        when the partition list is unavailable.

        Mount points that cannot be stat'ed, or that report a size of zero,
        are left out so one bad mount does not hide the others.
    """
    mount_points = get_disk_partition()
    if mount_points is None:
        return None

    usage = {}
    for mount_point in mount_points:
        try:
            stats = os.statvfs(mount_point)
        except OSError:
            continue
        total_bytes = stats.f_blocks * stats.f_frsize
        if total_bytes <= 0:
            # usage_percent() would raise on a zero total.
            continue
        used_bytes = (stats.f_blocks - stats.f_bfree) * stats.f_frsize
        usage[mount_point] = (
            '%.2f' % (usage_percent(used_bytes, total_bytes),),
            '%.2f' % (total_bytes * 1.0 / (1024 * 1000000)),
        )
    return usage

# Value of the SC_CLK_TCK sysconf setting. Nothing in the visible part of
# this script reads it.
_CLOCK_TICKS = os.sysconf("SC_CLK_TCK")

def _read_cpu_sample(path):
    """Parse the first line of a /proc/stat-format file.

    Returns a ``(total, idle, iowait)`` tuple of ints: the sum of every
    numeric column, the 4th column and the 5th column respectively.
    """
    f = open(path, 'r')
    try:
        fields = f.readline().split()
    finally:
        f.close()
    total = sum([int(value) for value in fields[1:]])
    return total, int(fields[4]), int(fields[5])


def _busy_percent(before, after, lenient=False):
    """Return the integer busy percentage between two CPU samples.

    Returns None when no ticks elapsed or, unless ``lenient`` is true, when
    a counter went backwards or the result falls outside 0..100. In lenient
    mode the result is clamped to 0..100 instead.
    """
    total = after[0] - before[0]
    idle = after[1] - before[1]
    iowait = after[2] - before[2]
    if total <= 0:
        return None
    percent = int(100.0 * (total - idle - iowait) / total)
    if lenient:
        return max(0, min(100, percent))
    if idle < 0 or iowait < 0 or percent < 0 or percent > 100:
        return None
    return percent


def get_cpu_time(stat_path='/proc/stat', snapshot_path='/tmp/cpu_stat'):
    """Return CPU utilisation since the previous run as an int percentage.

    The previous sample is kept in ``snapshot_path``. When it is missing,
    empty or unparsable it is seeded from ``stat_path`` and the function
    sleeps one second so there is an interval to measure. Time that is
    neither idle nor iowait counts as busy.

    If the delta against the snapshot is unusable (no ticks elapsed, a
    counter went backwards, or the result is outside 0..100) the function
    sleeps one second and measures that second instead. The snapshot is
    refreshed from ``stat_path`` before returning.

    Args:
        stat_path: file in /proc/stat format to sample.
        snapshot_path: file holding the sample taken by the previous call.
            NOTE(review): the default is a fixed name in /tmp; confirm that
            is acceptable on multi-user hosts.

    Returns:
        An int between 0 and 100; 0 when no ticks elapsed even after the
        retry.
    """
    need_sleep = False
    if not os.path.isfile(snapshot_path) or os.path.getsize(snapshot_path) == 0:
        copyfile(stat_path, snapshot_path)
        need_sleep = True

    try:
        previous = _read_cpu_sample(snapshot_path)
    except (IndexError, ValueError):
        # Corrupt snapshot: start over from a fresh sample.
        copyfile(stat_path, snapshot_path)
        previous = _read_cpu_sample(snapshot_path)
        need_sleep = True

    if need_sleep:
        time.sleep(1)

    current = _read_cpu_sample(stat_path)
    percent = _busy_percent(previous, current)
    if percent is None:
        # Compensation: measure the next second rather than trusting a
        # stale or inconsistent snapshot.
        time.sleep(1)
        latest = _read_cpu_sample(stat_path)
        percent = _busy_percent(current, latest, True)
        if percent is None:
            percent = 0

    copyfile(stat_path, snapshot_path)
    return percent

def _read_net_counters(dev_path):
    """Return ``{interface: (recv, sent)}`` cumulative counters.

    The receive and transmit byte columns of a /proc/net/dev-format file
    are multiplied by 0.0078125 (8 / 1024) and rounded to four decimals.
    The two header lines, lines without an interface name, and lines with
    missing or non-numeric columns are skipped.
    """
    f = open(dev_path, "r")
    try:
        lines = f.readlines()
    finally:
        f.close()

    counters = {}
    for line in lines[2:]:
        colon = line.find(':')
        if colon <= 0:
            continue
        fields = line[colon + 1:].split()
        if len(fields) < 9:
            continue
        try:
            received = float('%.4f' % (float(fields[0]) * 0.0078125))
            sent = float('%.4f' % (float(fields[8]) * 0.0078125))
        except ValueError:
            continue
        counters[line[:colon].strip()] = (received, sent)
    return counters


def network_io_kbitps(interval=1, dev_path="/proc/net/dev"):
    """Return per-interface network throughput in kilobits per second.

    Samples ``dev_path`` twice, ``interval`` seconds apart, and divides the
    counter difference by the interval.

    Args:
        interval: seconds to sleep between the two samples. A value of 0 or
            less skips the division and returns the raw difference.
        dev_path: file in /proc/net/dev format to sample.

    Returns:
        A dict mapping interface name to a ``(received, sent)`` tuple of
        floats. Interfaces that are not present in both samples are left
        out, because a cumulative counter is not a rate.
    """
    before = _read_net_counters(dev_path)
    time.sleep(interval)
    after = _read_net_counters(dev_path)

    scale = 1.0
    if interval > 0:
        scale = float(interval)

    rates = {}
    for name, (recv_after, sent_after) in after.items():
        if name not in before:
            continue
        recv_before, sent_before = before[name]
        rates[name] = ((recv_after - recv_before) / scale,
                       (sent_after - sent_before) / scale)
    return rates

def disk_io_Kbps():
    """Return per-device disk read/write throughput reported by iostat.

    Runs ``iostat -d -k 1 2`` through the shell, filters header and blank
    lines with sed into /tmp/disk_io, then parses that file.

    Returns:
        A dict mapping device name to a ``(read_kB_per_s, write_kB_per_s)``
        tuple of floats taken from the 3rd and 4th columns. When a device
        appears on several lines the last line wins. Returns None, after
        logging, when the command writes anything to stderr or the output
        file cannot be opened.
    """
    # "\\|" yields the same backslash-pipe the shell command always used,
    # without relying on an unrecognised string escape.
    iostat = Popen("iostat -d -k 1 2 | sed '/Device\\|Linux\\|^$/d' > /tmp/disk_io", shell=True, stdout=PIPE, stderr=PIPE)
    iostat_error = iostat.communicate()[1].strip()
    if iostat_error:
        logger.error("iostat not exists, %s" % iostat_error)
        return None

    try:
        f = open('/tmp/disk_io', 'r')
    except (IOError, OSError):
        # sys.exc_info() keeps this valid on every Python version the
        # script may run under.
        logger.error(sys.exc_info()[1])
        return None
    try:
        lines = f.readlines()
    finally:
        f.close()

    retdict = {}
    for line in lines:
        fields = line.split()
        # Index the columns instead of unpacking a fixed count so iostat
        # builds that print extra columns still parse.
        if len(fields) < 4:
            continue
        try:
            retdict[fields[0]] = (float(fields[2]), float(fields[3]))
        except ValueError:
            continue
    return retdict

def merge_with(d1, d2, fn=lambda x, y: tuple(map(operator.sub, x, y))):
    """Merge ``d2`` into a copy of ``d1``, combining shared keys with ``fn``.

    For every key of ``d2`` the result holds ``fn(d1[key], d2[key])``; when
    that raises KeyError (for example because ``d1`` lacks the key) the
    value from ``d2`` is stored unchanged. The default ``fn`` subtracts two
    tuples element by element. ``d1`` itself is not modified.
    """
    merged = d1.copy()
    for name, other in d2.iteritems():
        try:
            combined = fn(merged[name], other)
        except KeyError:
            combined = other
        merged[name] = combined
    return merged

def get_load(path='/proc/loadavg'):
    """Return the 1, 5 and 15 minute load averages.

    Args:
        path: file in /proc/loadavg format; the first three
            whitespace-separated fields of its first line are used.

    Returns:
        A ``(lavg_1, lavg_5, lavg_15)`` tuple of floats, or None when the
        file cannot be read or does not start with three numbers.
    """
    try:
        f = open(path, 'r')
        try:
            fields = f.readline().split()
        finally:
            # Close the handle even when reading fails.
            f.close()
        return float(fields[0]), float(fields[1]), float(fields[2])
    except (IOError, OSError, IndexError, ValueError):
        return None

def get_tcp_status():
    """Return TCP connection counts per state as one ``STATE=count`` string.

    Uses ``ss`` when ``command -v ss`` prints anything and falls back to
    ``netstat`` otherwise. The awk stage prints space-separated
    ``STATE=count`` pairs. For ``ss`` the sed stages replace "-" with "_",
    "ESTAB=" with "ESTABLISHED=" and "FIN_WAIT_" with "FIN_WAIT".

    Returns:
        The pipeline's standard output with trailing newlines removed.
    """
    probe = Popen("command -v ss", shell=True, stdout=PIPE)
    ss_path = probe.communicate()[0].rstrip('\n')

    if not ss_path:
        pipeline = "netstat -anp | grep tcp | awk '{print $6}' | awk '{state=$1;arr[state]++} END{for(i in arr){printf \"%s=%s \", i,arr[i]}}' | tail -n 1"
    else:
        pipeline = "ss -ant | awk '{if(NR != 1) print $1}' | awk '{state=$1;arr[state]++} END{for(i in arr){printf \"%s=%s \", i,arr[i]}}' | sed 's/-/_/g' | sed 's/ESTAB=/ESTABLISHED=/g' | sed 's/FIN_WAIT_/FIN_WAIT/g'"

    counter = Popen(pipeline, shell=True, stdout=PIPE)
    output = counter.communicate()[0]
    return output.rstrip('\n')

def get_proc_number():
    """Return the number of lines printed by ``ps axu`` as a string.

    The count is taken with ``wc -l`` and returned with trailing newlines
    removed.

    NOTE(review): ps usually prints a header line, so this is presumably
    one more than the number of processes - confirm before relying on it.
    """
    pipeline = Popen("ps axu | wc -l | tail -n 1", shell=True, stdout=PIPE)
    stdout_data = pipeline.communicate()[0]
    return stdout_data.rstrip('\n')

def all_index():
    """Collect every metric once and return them as a 9-tuple.

    The samplers run in the order of the returned tuple:
    ``(timestamp_ms, cpu, memory, disk, disk_io, network, load,
    tcp_status, process_number)``. The timestamp is the current time in
    milliseconds, taken before the other samplers run.
    """
    timestamp_ms = int(time.time() * 1000)
    cpu = get_cpu_time()
    memory = get_mem_usage_percent()
    disk = check_disk()
    disk_io = disk_io_Kbps()
    network = network_io_kbitps()
    load = get_load()
    tcp_status = get_tcp_status()
    process_number = get_proc_number()
    return (timestamp_ms, cpu, memory, disk, disk_io, network, load,
            tcp_status, process_number)

def _metric_line(metric, timestamp, value, unit, instance_id, extra_tag=''):
    """Format one metric line: name, timestamp, value, then the tags.

    The ``instanceId`` tag is emitted only when ``instance_id`` is set and
    always precedes ``extra_tag``.
    """
    line = '%s %s %s ns=ACS/ECS unit=%s' % (metric, timestamp, value, unit)
    if instance_id:
        line = line + ' instanceId=%s' % instance_id
    if extra_tag:
        line = line + ' ' + extra_tag
    return line + '\n'


def collector():
    """Sample all metrics once and POST them to the monitoring endpoint.

    The payload has one line per metric, grouped as CPU, memory, load
    averages, disk utilisation, disk reads, disk writes, network RX,
    network TX, TCP state counts and process count. A metric whose sampler
    returned nothing is left out instead of aborting the whole report.
    When the module-level ``UUID`` is set, each line carries an
    ``instanceId`` tag.

    The payload is printed to stdout, then sent after a random delay of up
    to five seconds to ``REMOTE_HOST:REMOTE_PORT`` at
    ``REMOTE_MONITOR_URI``. A non-200 response or any error while sending
    is logged; nothing is raised for it and nothing is returned.
    """
    timestamp, cpu, mem, disk, disk_io, net, load, tcp_status, process_number = all_index()

    # Whitespace (e.g. a trailing newline read from the uuid file) would
    # split a metric line in two.
    instance_id = None
    if UUID:
        instance_id = UUID.strip()

    lines = [_metric_line('vm.CPUUtilization', timestamp, cpu, 'Percent', instance_id)]

    if mem:
        lines.append(_metric_line('vm.MemoryUtilization', timestamp, mem[0], 'Percent', instance_id))

    if load:
        for value, period in zip(load, ('1min', '5min', '15min')):
            lines.append(_metric_line('vm.LoadAverage', timestamp, value, 'count', instance_id,
                                      'period=' + period))

    if disk:
        for name, value in disk.items():
            lines.append(_metric_line('vm.DiskUtilization', timestamp, value[0], 'Percent',
                                      instance_id, 'mountpoint=%s' % name))

    if disk_io:
        # All read lines first, then all write lines.
        for name, value in disk_io.items():
            lines.append(_metric_line('vm.DiskIORead', timestamp, value[0], 'Kilobytes/Second',
                                      instance_id, 'diskname=%s' % name))
        for name, value in disk_io.items():
            lines.append(_metric_line('vm.DiskIOWrite', timestamp, value[1], 'Kilobytes/Second',
                                      instance_id, 'diskname=%s' % name))

    if net:
        for name, value in net.items():
            lines.append(_metric_line('vm.InternetNetworkRX', timestamp, value[0], 'Kilobits/Second',
                                      instance_id, 'netname=%s' % name))
        for name, value in net.items():
            lines.append(_metric_line('vm.InternetNetworkTX', timestamp, value[1], 'Kilobits/Second',
                                      instance_id, 'netname=%s' % name))

    if tcp_status:
        for element in tcp_status.split():
            key_value = element.split('=', 1)
            if len(key_value) != 2:
                # Not a STATE=count token; skip it rather than crash.
                continue
            lines.append(_metric_line('vm.TcpCount', timestamp, key_value[1], 'Count',
                                      instance_id, 'state=%s' % key_value[0]))

    if process_number:
        lines.append(_metric_line('vm.ProcessCount', timestamp, process_number, 'Count', instance_id))

    data_post = ''.join(lines)
    print(data_post)

    # Random delay before sending.
    interval = random.randint(0, 5000)
    time.sleep(interval / 1000.0)

    headers = {"Content-Type": "text/plain", "Accept": "text/plain"}
    http_client = None
    try:
        try:
            http_client = httplib.HTTPConnection(REMOTE_HOST, REMOTE_PORT)
            http_client.request(method="POST", url=REMOTE_MONITOR_URI, body=data_post, headers=headers)
            response = http_client.getresponse()
            if response.status != 200:
                logger.warning("response code %d" % response.status)
                logger.warning("response body %s" % response.read())
        except Exception:
            logger.error(sys.exc_info()[1])
    finally:
        if http_client:
            http_client.close()

if __name__ == '__main__':
    # Defaults; ../cmscfg may override the host and port below.
    REMOTE_HOST = 'open.cms.aliyun.com'
    REMOTE_PORT = 80
    REMOTE_MONITOR_URI = "/metrics/putLines"
    MONITOR_DATA_FILE_DIR = "/tmp"

    # Set up logging first so configuration problems are recorded too.
    LOG_FILE = "/tmp/" + "vm.log"
    LOG_LEVEL = logging.INFO
    LOG_FILE_MAX_BYTES = 1024 * 1024
    LOG_FILE_MAX_COUNT = 3
    logger = logging.getLogger('sampler')
    logger.setLevel(LOG_LEVEL)
    handler = RotatingFileHandler(filename=LOG_FILE, mode='a', maxBytes=LOG_FILE_MAX_BYTES,
                                  backupCount=LOG_FILE_MAX_COUNT)
    formatter = logging.Formatter(fmt='%(asctime)s - %(levelname)s - %(message)s')
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    socket.setdefaulttimeout(10)

    try:
        # get report address (paths are relative to the working directory)
        if os.path.isfile("../cmscfg"):
            props = {}
            prop_file = open("../cmscfg", 'r')
            try:
                for line in prop_file:
                    # Split on the first "=" only so values may contain "=";
                    # lines without one (blank lines, comments) are ignored.
                    kv = line.split('=', 1)
                    if len(kv) != 2:
                        continue
                    props[kv[0].strip()] = kv[1].strip()
            finally:
                prop_file.close()
            if props.get('report_domain'):
                REMOTE_HOST = props.get('report_domain')
            if props.get('report_port'):
                try:
                    REMOTE_PORT = int(props.get('report_port'))
                except ValueError:
                    logger.error("invalid report_port %r, using %d" % (props.get('report_port'), REMOTE_PORT))

        # get uuid
        if os.path.isfile("../aegis_quartz/conf/uuid"):
            uuid_file = open("../aegis_quartz/conf/uuid", 'r')
            try:
                # Strip the line ending: the id is embedded in every metric
                # line, where a newline would break the line format.
                UUID = uuid_file.readline().strip().lower()
            finally:
                uuid_file.close()

        collector()
    except Exception:
        logger.error(sys.exc_info()[1])
        sys.exit(1)
相关标签: python linux