[PATCH] add service_check_timeout_state configuration variable

Bill McGonigle bill at bfccomputing.com
Tue Feb 9 19:34:36 CET 2010


Hi, all,

This patch adds a variable called 'service_check_timeout_state' which allows the admin to define the state that is returned when a service check times out.

I look after a handful of Nagios installations, and the #1 complaint is 'false alarms'. These typically result from the machine that Nagios is running on getting bogged down by some unrelated process (backups, etc.; Nagios doesn't usually get its own machine in a small business). A 'critical' state is then thrown, and too often everybody gets paged in the middle of the night (we page on critical).

Nagios has had the #ifdef SERVICE_CHECK_TIMEOUTS_RETURN_UNKNOWN available for re-compiling, which works, but users who rebuild with it are then unable to keep up with their distro's updates, and re-compiling may be beyond the skill of many.

This patch moves that idea into a variable, allows any of four states to be chosen ('critical' remaining the default), and does away with the #ifdef (which should be obsolete now).

I've been running with my in-house Nagios set to 'u', and so far there have been no late-night false alarms, though I can't say it's had extensive field testing.  This is also the first time I've done any Nagios hacking; I don't do much in C these days, but the code was very easy to follow - kudos.

Here's some suggested text for the sample config file:

---8<---8<---8<----

# SERVICE CHECK TIMEOUT STATE
# This setting determines the state Nagios will report when a
# service check times out - that is, does not respond within
# service_check_timeout seconds.  This can be useful if a
# machine is running at too high a load and you do not want
# to consider a timed-out service check to be critical (the default).
# Valid settings are:
# c - Critical (default)
# u - Unknown
# w - Warning
# o - OK

service_check_timeout_state=c

---8<---8<---8<----

The patch follows (in the format I have in my rpm file; I'm not sure how to use git yet).

Thanks,
-Bill


---8<---8<---8<----

diff -ur nagios-3.2.0/base/config.c nagios-3.2.0-bfc/base/config.c
--- nagios-3.2.0/base/config.c	2009-05-17 08:54:28.000000000 -0400
+++ nagios-3.2.0-bfc/base/config.c	2010-02-08 18:47:21.000000000 -0500
@@ -73,6 +73,7 @@
 extern int      log_passive_checks;
 
 extern int      service_check_timeout;
+extern int      service_check_timeout_state;
 extern int      host_check_timeout;
 extern int      event_handler_timeout;
 extern int      notification_timeout;
@@ -722,6 +723,23 @@
 				break;
 			        }
 		        }
+		
+		else if(!strcmp(variable,"service_check_timeout_state")){
+
+			if(!strcmp(value,"o"))
+			        service_check_timeout_state=STATE_OK;
+			else if(!strcmp(value,"w"))
+				service_check_timeout_state=STATE_WARNING;
+			else if(!strcmp(value,"c"))
+				service_check_timeout_state=STATE_CRITICAL;
+			else if(!strcmp(value,"u"))
+				service_check_timeout_state=STATE_UNKNOWN;
+			else{
+				asprintf(&error_message,"Illegal value for service_check_timeout_state");
+				error=TRUE;
+				break;
+			        }
+		        }
 
 		else if(!strcmp(variable,"host_check_timeout")){
 
diff -ur nagios-3.2.0/base/nagios.c nagios-3.2.0-bfc/base/nagios.c
--- nagios-3.2.0/base/nagios.c	2009-08-12 14:28:10.000000000 -0400
+++ nagios-3.2.0-bfc/base/nagios.c	2010-02-08 18:36:34.000000000 -0500
@@ -100,6 +100,7 @@
 unsigned long   syslog_options=0;
 
 int             service_check_timeout=DEFAULT_SERVICE_CHECK_TIMEOUT;
+int             service_check_timeout_state=STATE_CRITICAL;
 int             host_check_timeout=DEFAULT_HOST_CHECK_TIMEOUT;
 int             event_handler_timeout=DEFAULT_EVENT_HANDLER_TIMEOUT;
 int             notification_timeout=DEFAULT_NOTIFICATION_TIMEOUT;
diff -ur nagios-3.2.0/base/utils.c nagios-3.2.0-bfc/base/utils.c
--- nagios-3.2.0/base/utils.c	2009-08-11 12:53:04.000000000 -0400
+++ nagios-3.2.0-bfc/base/utils.c	2010-02-08 18:48:31.000000000 -0500
@@ -115,6 +115,7 @@
 extern unsigned long      syslog_options;
 
 extern int      service_check_timeout;
+extern int      service_check_timeout_state;
 extern int      host_check_timeout;
 extern int      event_handler_timeout;
 extern int      notification_timeout;
@@ -1905,11 +1906,7 @@
 	/* get the current time */
 	gettimeofday(&end_time,NULL);
 
-#ifdef SERVICE_CHECK_TIMEOUTS_RETURN_UNKNOWN
-	check_result_info.return_code=STATE_UNKNOWN;
-#else
-	check_result_info.return_code=STATE_CRITICAL;
-#endif
+	check_result_info.return_code=service_check_timeout_state;
 	check_result_info.finish_time=end_time;
 	check_result_info.early_timeout=TRUE;
 
---8<---8<---8<----


-- 
Bill McGonigle, Owner   
BFC Computing, LLC       
http://bfccomputing.com/ 
Telephone: +1.603.448.4440
Email, IM, VOIP: bill at bfccomputing.com           
VCard: http://bfccomputing.com/vcard/bill.vcf
Social networks: bill_mcgonigle/bill.mcgonigle


------------------------------------------------------------------------------
The Planet: dedicated and managed hosting, cloud storage, colocation
Stay online with enterprise data centers and the best network in the business
Choose flexible plans and management services without long-term contracts
Personal 24x7 support from experienced hosting pros just a phone call away.
http://p.sf.net/sfu/theplanet-com




More information about the Developers mailing list