diff --git a/conf_parser.py b/conf_parser.py
new file mode 100644
index 0000000..cb1c055
--- /dev/null
+++ b/conf_parser.py
@@ -0,0 +1,50 @@
+#! /usr/bin/python
+
+from configobj import ConfigObj, ConfigObjError
+
+
+class ConfigParser:
+    def __init__(self):
+        self.clusters = []
+
+    def parse(self, config_file):
+        try:
+            config = ConfigObj(config_file)
+
+            # get gmetad host information and the Nagios checkresult directory
+            self.gmetad_host = config.pop('gmetad_host')
+            self.gmetad_port = config.pop('gmetad_port')
+            self.force_dmax = config.pop('force_dmax')
+            self.tmax_grace = config.pop('tmax_grace')
+            self.strip_domains = config.pop('strip_domains')
+            self.nagios_result_dir = config.pop('nagios_result_dir')
+
+            for cluster_name in config.keys():
+                cluster_hosts = {}
+                # get the hosts in the cluster
+                for host_name in config[cluster_name].keys():
+                    metrics = []
+                    # collect the metrics for each host in the cluster
+                    for metric_name in config[cluster_name][host_name].keys():
+                        metric_def = {}
+                        metric_def['service_name'] = config[cluster_name][host_name][metric_name]['service_name']
+                        # default all thresholds to None so missing keys do not break the comparisons later
+                        metric_def['crit_above'] = None
+                        metric_def['crit_below'] = None
+                        metric_def['warn_above'] = None
+                        metric_def['warn_below'] = None
+                        if 'crit_above' in config[cluster_name][host_name][metric_name].keys():
+                            metric_def['crit_above'] = config[cluster_name][host_name][metric_name]['crit_above']
+                        if 'crit_below' in config[cluster_name][host_name][metric_name].keys():
+                            metric_def['crit_below'] = config[cluster_name][host_name][metric_name]['crit_below']
+                        if 'warn_above' in config[cluster_name][host_name][metric_name].keys():
+                            metric_def['warn_above'] = config[cluster_name][host_name][metric_name]['warn_above']
+                        if 'warn_below' in config[cluster_name][host_name][metric_name].keys():
+                            metric_def['warn_below'] = config[cluster_name][host_name][metric_name]['warn_below']
+                        metrics.append((metric_name, metric_def))
+                    for host in host_name.split(','):
+                        cluster_hosts.setdefault(host.lstrip(), []).append(metrics)
+                self.clusters.append((cluster_name, cluster_hosts))
+
+        except (ConfigObjError, IOError) as e:
+            print 'Could not read config file: %s' % e
diff --git a/ganglia-nagios-bridge.conf b/ganglia-nagios-bridge.conf
new file mode 100755
index 0000000..e08b137
--- /dev/null
+++ b/ganglia-nagios-bridge.conf
@@ -0,0 +1,87 @@
+gmetad_host = '127.0.0.1'
+gmetad_port = 8649
+# This overrides the DMAX attribute for all metrics on all hosts.
+# If DMAX > 0 and TN > DMAX, then a metric state is considered
+# UNKNOWN and Nagios will potentially send an alert
+force_dmax = 0
+
+# Every collection group in gmond.conf defines a time_threshold
+# This value appears as TMAX in the XML.
+# The gmond process should normally send every metric again before
+# the value timer TN exceeds TMAX.
+# If ganglia-nagios-bridge is polling a gmond collector
+# then a very small tmax_grace period (perhaps 5 seconds) is used.
+# If ganglia-nagios-bridge is polling a gmetad server then
+# tmax_grace should be set higher than the polling interval configured
+# in gmetad.
+tmax_grace = 30
+
+# Ganglia XML typically contains FQDNs for all hosts, as it obtains
+# the hostnames using reverse DNS lookups. Nagios, on the other hand,
+# is often configured with just the hostname and no domain. Setting
+# strip_domains = True will ensure that the domain part is stripped from
+# the hostname before passing it to Nagios.
+strip_domains = True
+
+# This is the directory where Nagios expects to read checkresults
+# submitted in batch
+nagios_result_dir = '/var/lib/nagios3/spool/checkresults'
+
+# This is where we select the metrics that we want to map from
+# Ganglia to Nagios service names.
+# Any metric not matched in the configuration will be ignored and
+# not passed to Nagios.
+# Definitions for multiple clusters and their hosts to be monitored,
+# along with their metrics, are added in a nested format.
+#
+# Format overview:
+# cluster definition specifying the cluster name and the host names and
+# associated metrics to be monitored
+#
+# multiple cluster names can be added
+# [cluster_name]
+# comma-separated host name(s) and the common metrics to be monitored
+#     [[hostname(s) separated by ,]]
+# name of the metric to be monitored for those host names
+#         [[[metric name]]]
+# metric attributes: the corresponding service name and threshold values
+#             service_name =
+#             warn_above/below =
+#             crit_above/below =
+
+# Sample configuration
+
+[cluster_name]
+    [[host_01, host_02]]
+        [[[proc_total]]]
+            service_name = Total Processes
+            warn_above = 180
+            crit_above = 200
+        [[[load_one]]]
+            service_name = Current Load
+            warn_above = 0.1
+            crit_above = 0.3
+    [[host_02]]
+        [[[cpu_idle]]]
+            service_name = CPU IDLE
+            warn_above = 85
+            crit_above = 90
+        [[[disk_free]]]
+            service_name = DISK FREE
+            warn_below = 5
+            crit_below = 2
+        [[[cpu_speed]]]
+            service_name = CPU SPEED
+            warn_below = 2112
+            crit_below = 2000
+[Production1]
+    [[host3, host12]]
+        [[[disk_free]]]
+            service_name = DISK FREE
+            warn_below = 10
+            crit_below = 5
+    [[host1]]
+        [[[cpu_speed]]]
+            service_name = CPU SPEED
+            warn_below = 2000
+            crit_below = 1890
diff --git a/ganglia-nagios-bridge.py b/ganglia-nagios-bridge.py
old mode 100755
new mode 100644
index 3e99d41..36f335f
--- a/ganglia-nagios-bridge.py
+++ b/ganglia-nagios-bridge.py
@@ -22,94 +22,81 @@
 ############################################################################
 
 import argparse
-import os
 import re
 import socket
-import tempfile
-import time
 import xml.sax
+import time
+import nagios_checkresult
+import conf_parser
+from pynag import Model
 
 # wrapper class so that the SAX parser can process data from a network
 # socket
 class SocketInputSource:
     def __init__(self, socket):
         self.socket = socket
-    
+
     def getByteStream(self):
         return self
-    
+
     def read(self, buf_size):
         return self.socket.recv(buf_size)
 
-# interprets metric values to generate Nagios passive notifications
+
+# interprets metric values to generate service return codes
 class PassiveGenerator:
     def __init__(self, force_dmax, tmax_grace):
         self.force_dmax = force_dmax
         self.tmax_grace = tmax_grace
-    
-        # Nagios is quite fussy about the filename, it must be
-        # a 7 character name starting with 'c'
-        tmp_file = tempfile.mkstemp(prefix='c',dir=nagios_result_dir)
-        self.fh = tmp_file[0]
-        self.cmd_file = tmp_file[1]
-        os.write(self.fh, "### Active Check Result File ###\n")
-        os.write(self.fh, "file_time=" + str(int(time.time())) + "\n")
-    
-    def done(self):
-        os.close(self.fh)
-        ok_filename = self.cmd_file + ".ok"
-        ok_fh = file(ok_filename, 'a')
-        ok_fh.close()
-    
-    def process(self, metric_def, service_name, host, metric_name, metric_value, metric_tn, metric_tmax, metric_dmax, last_seen):
+
+    def process(self, metric_def, metric_value, metric_tn, metric_tmax, metric_dmax):
         effective_dmax = metric_dmax
         if(self.force_dmax > 0):
             effective_dmax = force_dmax
         effective_tmax = metric_tmax + self.tmax_grace
         if effective_dmax > 0 and metric_tn > effective_dmax:
-            service_state = 3
+            service_return_code = 3
         elif metric_tn > effective_tmax:
-            service_state = 3
+            service_return_code = 3
         elif isinstance(metric_value, str):
-            service_state = 0
-        elif 'crit_below' in metric_def and metric_value < metric_def['crit_below']:
-            service_state = 2
-        elif 'warn_below' in metric_def and metric_value < metric_def['warn_below']:
-            service_state = 1
-        elif 'crit_above' in metric_def and metric_value > metric_def['crit_above']:
-            service_state = 2
-        elif 'warn_above' in metric_def and metric_value > metric_def['warn_above']:
-            service_state = 1
+            service_return_code = 0
+        elif metric_def['crit_below'] is not None and metric_value < float(metric_def['crit_below']):
+            service_return_code = 2
+        elif metric_def['warn_below'] is not None and metric_value < float(metric_def['warn_below']):
+            service_return_code = 1
+        elif metric_def['crit_above'] is not None and metric_value > float(metric_def['crit_above']):
+            service_return_code = 2
+        elif metric_def['warn_above'] is not None and metric_value > float(metric_def['warn_above']):
+            service_return_code = 1
         else:
-            service_state = 0
-        #cmd = "[" + str(int(time.time())) + "] PROCESS_SERVICE_CHECK_RESULT;" + host + ";" + service_name + ";" + str(service_state) + ";Value = " + str(metric_value)
-        #os.write(self.fh, cmd + "\n")
-        os.write(self.fh, "\n### Nagios Service Check Result ###\n")
-        os.write(self.fh, "# Time: " + time.asctime() + "\n")
-        os.write(self.fh, "host_name=" + host + "\n")
-        os.write(self.fh, "service_description=" + service_name + "\n")
-        os.write(self.fh, "check_type=0\n")
-        os.write(self.fh, "check_options=0\n")
-        os.write(self.fh, "scheduled_check=1\n")
-        os.write(self.fh, "reschedule_check=1\n")
-        os.write(self.fh, "latency=0.1\n")
-        os.write(self.fh, "start_time=" + str(last_seen) + ".0\n")
-        os.write(self.fh, "finish_time=" + str(last_seen) + ".0\n")
-        os.write(self.fh, "early_timeout=0\n")
-        os.write(self.fh, "exited_ok=1\n")
-        os.write(self.fh, "return_code=" + str(service_state) + "\n")
-        os.write(self.fh, "output=" + service_name + " " + str(metric_value) + "\\n\n")
-        #os.write(self.fh, "\n")
+            service_return_code = 0
+        return service_return_code
+
+# gets the hosts and services Nagios knows about
+class NagiosHosts:
+    def __init__(self):
+        self.host_service = []
+
+    def process(self):
+        all_hosts = Model.Host.objects.all
+        for host in all_hosts:
+            service_name = []
+            for service in host.get_effective_services():
+                service_name.append(service.service_description)
+            self.host_service.append((host.host_name, service_name))
 
 # SAX event handler for parsing the Ganglia XML stream
 class GangliaHandler(xml.sax.ContentHandler):
 
-    def __init__(self, clusters_c, value_handler):
+    def __init__(self, clusters_c, value_handler, checkresult_file_handler, strip_domains, nagios_hosts):
         self.clusters_c = clusters_c
         self.value_handler = value_handler
+        self.checkresult_file_handler = checkresult_file_handler
         self.clusters_cache = {}
         self.hosts_cache = {}
         self.metrics_cache = {}
+        self.strip_domains = strip_domains
+        self.host_service = nagios_hosts.host_service
 
     def startElement(self, name, attrs):
 
@@ -134,36 +121,44 @@ def startElement(self, name, attrs):
                 self.handle_metric(metric_name, service_name, attrs)
                 return
             for idx, metric_def in enumerate(self.metrics):
-                match_result = metric_def[0].match(metric_name)
+                match_result = metric_def[0] == metric_name
                 if match_result:
-                    service_name_tmpl = metric_def[1]['service_name']
-                    if len(match_result.groups()) > 0:
-                        service_name = match_result.expand(service_name_tmpl)
-                    else:
-                        service_name = service_name_tmpl
-                    self.metrics_cache[cache_key] = (idx, service_name)
-                    self.metric = metric_def[1]
-                    self.handle_metric(metric_name, service_name, attrs)
-                    return
+                    service_name = metric_def[1]['service_name']
+                    # only handle the metric if the service is defined in Nagios for this host
+                    if service_name in self.nagios_service:
+                        self.metrics_cache[cache_key] = (idx, service_name)
+                        self.metric = metric_def[1]
+                        self.handle_metric(metric_name, service_name, attrs)
+                        return
 
         # handle a HOST element in the XML
-        if name == "HOST" and self.hosts is not None:
+        if name == "HOST":
             self.metrics = None
-            self.host_name = attrs['NAME']
-            self.host_reported = long(attrs['REPORTED'])
-            if strip_domains:
-                self.host_name = self.host_name.partition('.')[0]
-            cache_key = (self.cluster_idx, self.host_name)
-            if cache_key in self.hosts_cache:
-                self.host_idx = self.hosts_cache[cache_key]
-                self.metrics = self.clusters_c[self.cluster_idx][1][self.host_idx][1]
-                return
-            for idx, host_def in enumerate(self.hosts):
-                if host_def[0].match(self.host_name):
-                    self.hosts_cache[cache_key] = idx
-                    self.host_idx = idx
-                    self.metrics = host_def[1]
+            if self.hosts is not None:
+                self.host_name = attrs['NAME']
+                self.host_reported = long(attrs['REPORTED'])
+                self.nagios_service = None
+                if self.strip_domains:
+                    self.host_name = self.host_name.partition('.')[0]
+                cache_key = (self.cluster_idx, self.host_name)
+                if cache_key in self.hosts_cache:
+                    self.host_idx = self.hosts_cache[cache_key]
+                    self.metrics = self.clusters_c[self.cluster_idx][1][self.host_idx][1]
+                    self.handle_host(self.host_name, attrs)
-                return
+                    return
+                for idx, host_def in enumerate(self.hosts):
+                    if host_def[0] == self.host_name:
+                        for host in self.host_service:
+                            if host[0] == self.host_name:
+                                self.hosts_cache[cache_key] = idx
+                                self.host_idx = idx
+                                self.metrics = []
+                                for metric_list in host_def[1]:
+                                    self.metrics += metric_list
+                                self.handle_host(self.host_name, attrs)
+                                # get the services defined for the host in Nagios
+                                self.nagios_service = host[1]
+                                return
 
         # handle a CLUSTER element in the XML
         if name == "CLUSTER":
@@ -175,12 +170,28 @@ def startElement(self, name, attrs):
             self.hosts = self.clusters_c[self.cluster_idx][1]
             return
         for idx, cluster_def in enumerate(self.clusters_c):
-            if cluster_def[0].match(self.cluster_name):
+            if cluster_def[0] == self.cluster_name:
                 self.clusters_cache[self.cluster_name] = idx
                 self.cluster_idx = idx
-                self.hosts = cluster_def[1]
+                self.hosts = []
+                for host_name in cluster_def[1]:
+                    self.hosts.append((host_name, cluster_def[1][host_name]))
                 return
 
+    # checks the state of the host by comparing TN and TMAX for the host
+    def handle_host(self, host_name, attrs):
+        host_tn = int(attrs['TN'])
+        host_tmax = int(attrs['TMAX'])
+        last_seen = self.cluster_localtime - host_tn
+        if host_tn > host_tmax * 4:
+            host_return_code = 1  # host down
+        else:
+            host_return_code = 0  # host up
+        host_last_seen = str(last_seen) + '.0'
+
+        # write the host check to the Nagios checkresult file
+        self.checkresult_file_handler.build_host(time.asctime(), self.host_name, 0, 0, 1, 1, 0.1, host_last_seen, host_last_seen, 0, 1, host_return_code, "")
+
     def handle_metric(self, metric_name, service_name, attrs):
         # extract the metric attributes
         metric_value_raw = attrs['VAL']
@@ -188,7 +199,8 @@ def handle_metric(self, metric_name, service_name, attrs):
         metric_tmax = int(attrs['TMAX'])
         metric_dmax = int(attrs['DMAX'])
         metric_type = attrs['TYPE']
-        # they metric_value has a dynamic type:
+        metric_units = attrs['UNITS']
+        # the metric_value has a dynamic type:
         if metric_type == 'string':
             metric_value = metric_value_raw
         elif metric_type == 'double' or metric_type == 'float':
@@ -196,8 +208,16 @@
         else:
            metric_value = int(metric_value_raw)
         last_seen = self.cluster_localtime - metric_tn
-        # call the handler to process the value:
-        self.value_handler.process(self.metric, service_name, self.host_name, metric_name, metric_value, metric_tn, metric_tmax, metric_dmax, last_seen)
+        service_last_seen = str(last_seen) + '.0'
+
+        # the service return code defaults to 0 (OK)
+        service_return_code = 0
+        # call the handler to compare the metric value with the thresholds and return the service state:
+        service_return_code = self.value_handler.process(self.metric, metric_value, metric_tn, metric_tmax, metric_dmax)
+        # write the passive service check to the checkresult file
+        self.checkresult_file_handler.build_service(time.asctime(), self.host_name, service_name, 0, 0, 1, 1, 0.1, service_last_seen, service_last_seen, 0, 1, service_return_code, metric_value, metric_units, "")
+
+
 
 # main program code
 if __name__ == '__main__':
@@ -205,43 +225,41 @@
     # parse command line
     parser = argparse.ArgumentParser(description='read Ganglia XML and generate Nagios check results file')
     parser.add_argument('config_file', nargs='?',
-        help='configuration file', default='/etc/ganglia/nagios-bridge.conf')
+        help='configuration file', default='/etc/ganglia/ganglia-nagios-bridge.conf')
     args = parser.parse_args()
 
     # read the configuration file, setting some defaults first
     force_dmax = 0
     tmax_grace = 60
-    execfile(args.config_file)
-
-    # compile the regular expressions
-    clusters_c = []
-    for cluster_def in clusters:
-        cluster_c = re.compile(cluster_def[0])
-        hosts = []
-        for host_def in cluster_def[1]:
-            host_c = re.compile(host_def[0])
-            metrics = []
-            for metric_def in host_def[1]:
-                metric_c = re.compile(metric_def[0])
-                metrics.append((metric_c, metric_def[1]))
-            hosts.append((host_c, metrics))
-        clusters_c.append((cluster_c, hosts))
+    # parse the config file
+    config_parse = conf_parser.ConfigParser()
+    config_parse.parse(args.config_file)
+
+    # get hosts and associated services known to Nagios, to avoid generating checkresults for hosts not known to Nagios
+    nagios_hosts = NagiosHosts()
+    nagios_hosts.process()
     # connect to the gmetad or gmond
-    sock = socket.create_connection((gmetad_host, gmetad_port))
+    sock = socket.create_connection((config_parse.gmetad_host, config_parse.gmetad_port))
 
     # set up the SAX parser
     parser = xml.sax.make_parser()
     pg = PassiveGenerator(force_dmax, tmax_grace)
-    parser.setContentHandler(GangliaHandler(clusters_c, pg))
-    # run the main program loop
-    parser.parse(SocketInputSource(sock))
+    # instantiate the GenerateNagiosCheckResult class
+    gn = nagios_checkresult.GenerateNagiosCheckResult()
+    # create the checkresult file
+    try:
+        gn.create(config_parse.nagios_result_dir, int(time.time()))
+        parser.setContentHandler(GangliaHandler(config_parse.clusters, pg, gn, config_parse.strip_domains, nagios_hosts))
+        # run the main program loop
+        parser.parse(SocketInputSource(sock))
+
+        # write out for Nagios
+        gn.submit()
 
-    # write out for Nagios
-    pg.done()
+        # all done
+        sock.close()
+    except OSError as e:
+        print "Failed to create tempfile in %s: %s" % (config_parse.nagios_result_dir, e)
 
-    # all done
-    sock.close()
     except socket.error as e:
         logging.warn('Failed to connect to gmetad: %s', e.strerror)
-
-
diff --git a/nagios_checkresult.py b/nagios_checkresult.py
new file mode 100644
index 0000000..09b41c8
--- /dev/null
+++ b/nagios_checkresult.py
@@ -0,0 +1,92 @@
+#!/usr/bin/python
+#
+# NagiosCheckResult - class that creates a Nagios checkresult file and
+# writes passive host and service checks to it
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+###########################################################################
+
+import os
+import tempfile
+import sys
+
+
+class GenerateNagiosCheckResult:
+
+    def __init__(self):
+        self.service_state = {0: 'OK', 1: 'WARNING', 2: 'CRITICAL', 3: 'UNKNOWN'}
+        self.host_state = {0: 'UP', 1: 'DOWN', 2: 'DOWN', 3: 'DOWN'}
+
+    # Creates a checkresult file
+    def create(self, nagios_result_dir, file_time):
+        # Nagios is quite fussy about the filename, it must be
+        # a 7 character name starting with 'c'
+        tmp_file = tempfile.mkstemp(prefix='c', dir=nagios_result_dir)  # mkstemp returns (fd, path)
+        self.fh = tmp_file[0]
+        self.cmd_file = tmp_file[1]
+        os.write(self.fh, "### Active Check Result File ###\n")
+        os.write(self.fh, "file_time=" + str(file_time) + "\n")
+
+    # Accepts the parameters required for a host checkresult and
+    # writes the host check to the checkresult file
+    def build_host(self, checkresult_time, host, check_type, check_options, scheduled_check, reschedule_check, latency, start_time, finish_time, early_timeout, exited_ok, host_return_code, output_string):
+        os.write(self.fh, "\n### Nagios Host Check Result ###\n")
+        os.write(self.fh, "# Time: " + checkresult_time + "\n")
+        os.write(self.fh, "host_name=" + host + "\n")
+        os.write(self.fh, "check_type=" + str(check_type) + "\n")
+        os.write(self.fh, "check_options=" + str(check_options) + "\n")
+        os.write(self.fh, "scheduled_check=" + str(scheduled_check) + "\n")
+        os.write(self.fh, "reschedule_check=" + str(reschedule_check) + "\n")
+        os.write(self.fh, "latency=" + str(latency) + "\n")
+        os.write(self.fh, "start_time=" + str(start_time) + "\n")
+        os.write(self.fh, "finish_time=" + str(finish_time) + "\n")
+        os.write(self.fh, "early_timeout=" + str(early_timeout) + "\n")
+        os.write(self.fh, "exited_ok=" + str(exited_ok) + "\n")
+        os.write(self.fh, "return_code=" + str(host_return_code) + "\n")
+        if not output_string:
+            os.write(self.fh, "output=" + " " + "Host (" + host + ")" + " " + self.host_state[host_return_code] + "\\n\n")
+        else:
+            os.write(self.fh, "output=" + " " + output_string + "\\n\n")
+
+    # Accepts the parameters required for a service checkresult and
+    # writes the service check to the checkresult file
+    def build_service(self, checkresult_time, host, service_name, check_type, check_options, scheduled_check, reschedule_check, latency, start_time, finish_time, early_timeout, exited_ok, service_return_code, metric_value, metric_units, output_string):
+        os.write(self.fh, "\n### Nagios Service Check Result ###\n")
+        os.write(self.fh, "# Time: " + checkresult_time + "\n")
+        os.write(self.fh, "host_name=" + host + "\n")
+        os.write(self.fh, "service_description=" + service_name + "\n")
+        os.write(self.fh, "check_type=" + str(check_type) + "\n")
+        os.write(self.fh, "check_options=" + str(check_options) + "\n")
+        os.write(self.fh, "scheduled_check=" + str(scheduled_check) + "\n")
+        os.write(self.fh, "reschedule_check=" + str(reschedule_check) + "\n")
+        os.write(self.fh, "latency=" + str(latency) + "\n")
+        os.write(self.fh, "start_time=" + str(start_time) + "\n")
+        os.write(self.fh, "finish_time=" + str(finish_time) + "\n")
+        os.write(self.fh, "early_timeout=" + str(early_timeout) + "\n")
+        os.write(self.fh, "exited_ok=" + str(exited_ok) + "\n")
+        os.write(self.fh, "return_code=" + str(service_return_code) + "\n")
+        if not output_string:
+            os.write(self.fh, "output=" + service_name + " " + self.service_state[service_return_code] + "- " + service_name + " " + str(metric_value) + " " + metric_units + "\\n\n")
+        else:
+            os.write(self.fh, "output=" + " " + output_string + "\\n\n")
+
+    # Close the file handle and create an ok-to-go indicator file
+    def submit(self):
+        os.close(self.fh)
+        ok_filename = self.cmd_file + ".ok"
+        ok_fh = open(ok_filename, 'a')
+        ok_fh.close()
+        return self.cmd_file
+
diff --git a/test_nagios_checkresult.py b/test_nagios_checkresult.py
new file mode 100644
index 0000000..47ff5a2
--- /dev/null
+++ b/test_nagios_checkresult.py
@@ -0,0 +1,60 @@
+#! /usr/bin/python
+
+import unittest
+import time
+import textwrap
+import nagios_checkresult
+
+class TestNagiosCheckResult(unittest.TestCase):
+    def setUp(self):
+        self.maxDiff = None
+        # this is what the checkresult file should look like
+        self.checkresult = textwrap.dedent("""\
+            ### Active Check Result File ###
+            file_time=1400347643.73
+
+            ### Nagios Host Check Result ###
+            # Time: Sat May 17 22:57:23 2014
+            host_name=xyz
+            check_type=0
+            check_options=0
+            scheduled_check=1
+            reschedule_check=1
+            latency=0.1
+            start_time=1399732963.0
+            finish_time=1399732963.0
+            early_timeout=0
+            exited_ok=1
+            return_code=0
+            output= Host (xyz) UP\\n
+
+            ### Nagios Service Check Result ###
+            # Time: Sat May 17 22:57:23 2014
+            host_name=xyz
+            service_description=Total processes
+            check_type=0
+            check_options=0
+            scheduled_check=1
+            reschedule_check=1
+            latency=0.1
+            start_time=1399732963.0
+            finish_time=1399732963.0
+            early_timeout=0
+            exited_ok=1
+            return_code=0
+            output=Total processes OK- Total processes 288 \\n\n""")
+
+    def test_checkresult(self):
+        # generate a checkresult file by sending data to GenerateNagiosCheckResult
+        ng = nagios_checkresult.GenerateNagiosCheckResult()
+        ng.create('/var/lib/nagios3/spool/checkresults', 1400347643.73)
+        ng.build_host('Sat May 17 22:57:23 2014', 'xyz', 0, 0, 1, 1, 0.1, str(1399732963.0), str(1399732963.0), 0, 1, 0, "")
+        ng.build_service('Sat May 17 22:57:23 2014', 'xyz', 'Total processes', 0, 0, 1, 1, 0.1, str(1399732963.0), str(1399732963.0), 0, 1, 0, 288, "", "")
+        # fname is the name of the generated checkresult file
+        fname = ng.submit()
+        self.testfile = open(fname).read()
+        # compare the generated checkresult file with the expected content
+        self.assertMultiLineEqual(self.testfile, self.checkresult, msg=None)
+
+if __name__ == '__main__':
+    unittest.main()