Nagios + SMS = TRUE?

Mark Ferlatte ferlatte at cryptio.net
Fri Sep 26 23:56:22 CEST 2003


James Turnbull said on Fri, Sep 26, 2003 at 10:59:18AM +1000:
> > 5 pages in 5 minutes; any more than that, we don't send them, and just send a
> > "Too many pages" page, which everyone understands to mean "Bad things have
> > happened".
> 
> This is a great idea - we have just started looking at this ourselves.
> Would you be averse to sharing the script?

Not at all.  You'll have to hack it into shape for your installation; it
definitely falls into the hack category.

If I get a chance, I'm going to clean it up so that I can use it in other
places, but that seems less likely with each day.  :)

M
-------------- next part --------------
#!/usr/bin/python

import getopt
import smtplib
import traceback
import os
import sys
import re
import time
from types import *

#####################################
#
# send_mail_throttled.py
#
#

# This script is a mailer script that "throttles" mail sent to devices
# which we don't want to flood with mail, such as cell phones.
#
# Here's the design:
#
# If we get 5 pages in a five minute window:
# - block all pages, accumulate to a web page
# - send a page saying that we're stopping pages
# - wait 5 minutes before clearing the history and restarting paging.

#
# We need to maintain a history on a per-e-mail level
#
# 
#

class WriteLog:
    """File-like wrapper that tees everything written to it into a log file.

    Every write() goes first to the append-mode log file and then to the
    wrapped stream, so installing an instance as sys.stdout keeps a
    persistent record of what the script printed.

    stream   -- the underlying file-like object to pass output through to.
    log_path -- file to append the copy to (defaults to /var/tmp/spt_log).
    """

    def __init__(self, stream, log_path="/var/tmp/spt_log"):
        self.stream = stream
        self.mLogFile = open(log_path, "a")

    def write(self, text):
        """Write text to the log file and to the wrapped stream."""
        self.mLogFile.write(text)
        self.stream.write(text)

    def flush(self):
        """Flush both destinations.

        The log file has to be flushed as well; otherwise its lines sit in
        the buffer and are lost if the process dies before exiting cleanly.
        """
        self.mLogFile.flush()
        self.stream.flush()

# From here on, everything printed is also appended to the log file that
# WriteLog opens, while still reaching the real standard output.
sys.stdout = WriteLog(sys.stdout)


# Size of the sliding window: main() keeps only this many of the most
# recent page timestamps for a recipient.
gMaxMessages = 5
# Window length in seconds: when a full window of gMaxMessages timestamps
# spans no more than this, main() blocks further pages.
gMinTime = 300
# Seconds a block stays in force; main() resumes paging once the recorded
# block is older than this.
gBlockedTimeout = 300

def _abbreviate(text):
    """Shorten the verbose Nagios state words so pages stay compact."""
    for long_word, short_word in (("ACKNOWLEDGEMENT", "ACK"),
                                  ("CRITICAL", "CRIT"),
                                  ("WARNING", "WARN")):
        text = text.replace(long_word, short_word)
    return text


def _read_history(hist_filename):
    """Parse a history file and return (mail_times, block_time).

    mail_times is the list of integer timestamps found on lines holding
    only digits.  block_time is the timestamp of the last "<digits>Blocked"
    line, or 0 when the file has none.
    """
    time_re = re.compile(r"^(\d+)$")
    block_re = re.compile(r"^(\d+)Blocked")
    mail_times = []
    block_time = 0
    hist_file = open(hist_filename, 'r')
    try:
        for line in hist_file.readlines():
            re_match = time_re.match(line)
            if re_match:
                mail_times.append(int(re_match.group(1)))
                continue
            # Not a plain timestamp: maybe we've been blocked for volume.
            re_match = block_re.match(line)
            if re_match:
                print("Blocked!")
                block_time = int(re_match.group(1))
    finally:
        hist_file.close()
    return mail_times, block_time


def _write_history(hist_filename, entries):
    """Replace the history file's contents with the given lines."""
    hist_file = open(hist_filename, 'w')
    try:
        hist_file.writelines(entries)
    finally:
        hist_file.close()


def main():
    """Send one page, unless the recipient is being flooded.

    Command line: --to=ADDR --subj=SUBJECT --mesg=BODY (all optional).

    A per-recipient history file, /var/tmp/<to>.mailhist, records when
    pages were sent.  The page goes out if there is no history yet, if
    fewer than gMaxMessages timestamps are in the window, or if the window
    spans more than gMinTime seconds.  Otherwise a "<timestamp>Blocked"
    marker is written, a single "Too many pages" notice is sent, and later
    pages are dropped until the marker is older than gBlockedTimeout.

    Any error is reported as a traceback and otherwise swallowed, so the
    script itself never fails with an uncaught exception.

    print() is always called with one pre-formatted string, so the output
    is the same whether print is a statement or a function.
    """
    try:
        print("Time: %s" % time.ctime(time.time()))
        # The mailing-list archive scrubbed '@' into ' at '; these are the
        # real addresses.
        from_addr = 'nagios@localhost'
        to_addr = 'root@localhost'
        subj = 'No subject'
        mesg = 'No message'

        options = ''
        long_options = ['to=', 'subj=', 'mesg=']

        opts, args = getopt.getopt(sys.argv[1:], options, long_options)

        print(str(sys.argv))

        for o, a in opts:
            if o == '--to':
                to_addr = a
            elif o == '--subj':
                subj = a
            elif o == '--mesg':
                mesg = a
            else:
                # getopt rejects unknown options itself, so this is purely
                # defensive.  It must be a real exception object, because
                # string exceptions are not supported by newer interpreters.
                raise ValueError("Unknown argument " + o)

        hist_filename = "/var/tmp/%s.mailhist" % to_addr

        #
        # Clean up the subject and message to get rid of excess verbiage
        #
        subj = _abbreviate(subj)
        # The message arrives with literal backslash-n sequences; turn them
        # into real line breaks before abbreviating.
        mesg = _abbreviate(mesg.replace("\\n", "\n"))

        if not os.path.exists(hist_filename):
            # No history yet: send, and let send_mail_log create the file.
            print("No history file %s!" % hist_filename)
            print("Sending page!")
            send_mail_log(from_addr, to_addr, subj, mesg)
            return

        mail_times, block_time = _read_history(hist_filename)

        if block_time:
            block_age = int(time.time()) - block_time
            print("Blocked %d seconds ago!" % block_age)
            if block_age > gBlockedTimeout:
                # It's been long enough, start sending pages again
                print("Clearing block, starting mail again")
                mail_times = []
            else:
                # We need to store it on the web page
                print("Ignoring blocked message!")
                return

        # Add this page to the window and keep only the newest entries.
        mail_times.append(int(time.time()))
        mail_times = mail_times[-gMaxMessages:]
        print(mail_times)

        # The history is rewritten without the entry just added, because
        # send_mail_log appends its own timestamp after sending.
        kept_entries = ["%d\n" % mail_time for mail_time in mail_times[:-1]]

        if len(mail_times) < gMaxMessages:
            print("Less than %s alerts, sending!" % gMaxMessages)
            try:
                _write_history(hist_filename, kept_entries)
            except (IOError, OSError):
                # Best effort: a history file we cannot write must not
                # stop the page itself.
                print("Could not rewrite history file %s" % hist_filename)
            send_mail_log(from_addr, to_addr, subj, mesg)
            return

        total_time = mail_times[-1] - mail_times[0]

        print("%s messages in %s seconds!" % (gMaxMessages, total_time))
        if total_time > gMinTime:
            _write_history(hist_filename, kept_entries)
            send_mail_log(from_addr, to_addr, subj, mesg)
            return

        # Uh oh, too many. Send out the warning alert and set
        # us into archive and wait mode.
        print("Too many messages!")
        _write_history(hist_filename, ["%dBlocked\n" % int(time.time())])
        send_mail_log(from_addr,
                      to_addr,
                      "",
                      "Too many pages, blocking for 5 minutes!")
    except Exception:
        # Report the failure but never let it escape as an uncaught error.
        traceback.print_exc()


def send_mail_log(from_addr, to_addr, subject, body):
    """Send a mail and record the send time in the recipient's history.

    The mail is sent first via send_mail(); if that raises, nothing is
    recorded.  Afterwards the current time (whole seconds since the epoch)
    is appended as one line to /var/tmp/<to_addr>.mailhist.  Recording is
    best effort: if the history file cannot be opened, the failure is
    printed and otherwise ignored, since the mail has already gone out.
    """
    send_mail(from_addr, to_addr, subject, body)

    hist_filename = "/var/tmp/%s.mailhist" % to_addr
    try:
        hist_file = open(hist_filename, 'a')
    except (IOError, OSError):
        print("Could not open history file %s" % hist_filename)
        return

    try:
        hist_file.write("%d\n" % int(time.time()))
    finally:
        # Close the handle even when the write fails.
        hist_file.close()

def send_mail(from_addr, to_addrs, subject, body, smtp_host='mail'):
    """
    Send mail using SMTP

    from_addr -- envelope sender, also used for the From: header.
    to_addrs  -- one address as a string, or a list/tuple of addresses.
    subject   -- text for the Subject: header.
    body      -- message text, appended after the headers.
    smtp_host -- SMTP server to connect to (defaults to 'mail').

    Exceptions from smtplib (connection or delivery failures) propagate to
    the caller.
    """
    if isinstance(to_addrs, (list, tuple)):
        to_string = ', '.join(to_addrs)
    else:
        to_string = to_addrs

    # A line break inside the subject would end the header section early,
    # so fold any CR/LF into spaces.
    subject = subject.replace('\r', ' ').replace('\n', ' ')

    msg = "From: %s\r\nTo: %s\r\nSubject: %s\r\n\r\n" % (from_addr,
                                                         to_string,
                                                         subject)
    msg += body

    server = smtplib.SMTP(smtp_host)
    try:
        server.sendmail(from_addr, to_addrs, msg)
        server.quit()
    finally:
        # close() is harmless after a successful quit() and releases the
        # socket when sendmail() or quit() raised.
        server.close()


# Run only when executed as a script, so that importing this file does not
# send mail.
if __name__ == "__main__":
    main()
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 187 bytes
Desc: not available
URL: <https://www.monitoring-lists.org/archive/users/attachments/20030926/2089e672/attachment.sig>


More information about the Users mailing list