Monitoring - 1000+ hosts and latency

David Parrish david at dparrish.com
Wed Aug 6 01:32:12 CEST 2003


I'm running a nagios install monitoring nearly 7000 services, and it is
still growing. It was unbearable until it was patched so that it forks less
often to process external check results.

If a large portion of your checks are passive, then this patch will help a
lot. Probably not much for active checks though.

My average check latency is 0.006 sec.

On Tue, Aug 05, 2003 at 12:23:25PM +0100, ftang at cgg.com wrote:
> From: ftang at cgg.com
> To: nagios-users at lists.sourceforge.net
> Subject: [Nagios-users] Monitoring - 1000+ hosts and latency
> Date: Tue, 5 Aug 2003 12:23:25 +0100
> 
> All,
> 
> Is there anyone currently using nagios (any version) on over 1000 hosts
> (3000+ services)? I know that there have been emails about users
> experiencing extreme latencies, and you can imagine that I too am in the
> same crowd. I like nagios (used netsaint in my last company - only 15
> hosts), but I don't think that it is suitable for my current setup. I hope
> I am wrong.
> 
> Thanks for your time.
> 
> Fin Tang
> Senior Support Specialist
> 

-- 
Regards,
David Parrish
0410 586 121
-------------- next part --------------
diff -ur nagios-1.1-virgin/base/nagios.c nagios-1.1/base/nagios.c
--- nagios-1.1-virgin/base/nagios.c	Thu Jul 10 08:30:10 2003
+++ nagios-1.1/base/nagios.c	Thu Jul 10 08:32:20 2003
@@ -51,6 +51,8 @@
 #include <getopt.h>
 #endif
 
+#include <time.h>
+
 /******** BEGIN EMBEDDED PERL INTERPRETER DECLARATIONS ********/
 
 #ifdef EMBEDDEDPERL 
@@ -1497,6 +1499,8 @@
 		printf("Current/Max Outstanding Checks: %d/%d\n",currently_running_service_checks,max_parallel_service_checks);
 #endif
 
+		check_for_external_commands();
+
 		/* handle high priority events */
 		if(event_list_high!=NULL && (current_time>=event_list_high->run_time)){
 
@@ -1594,8 +1598,13 @@
 			        }
 
 			/* wait a second so we don't hog the CPU... */
-			else
-				sleep((unsigned int)sleep_time);
+			else {
+				struct timespec t;
+				t.tv_sec = 0;
+				t.tv_nsec = 1000 * 1000 * 50;
+				nanosleep(&t, NULL);
+				//check_for_external_commands();
+				}
 		        }
 
 		/* we don't have anything to do at this moment in time */
@@ -1606,7 +1615,12 @@
 				check_for_external_commands();
 
 			/* wait a second so we don't hog the CPU... */
-			sleep((unsigned int)sleep_time);
+
+			{ struct timespec t;
+			t.tv_sec = 0;
+			t.tv_nsec = 1000 * 1000 * 50;
+			nanosleep(&t, NULL); }
+
 		        }
 	        }
 
--- commands.c.old	Sat Aug  2 14:13:23 2003
+++ nagios-1.1/base/commands.c	Sat Aug  2 14:13:53 2003
@@ -347,8 +347,19 @@
 
 
 	/**** PROCESS ALL PASSIVE CHECK RESULTS AT ONE TIME ****/
-	if(passive_check_result_list!=NULL)
-		process_passive_service_checks();
+        {
+                static unsigned int last_checked = 0;
+                unsigned int t;
+                                                                                                                       
+                        /* Don't process more frequently than once every 5 seconds. */
+                        /* This does a fork!!! */
+                time(&t);
+                if(passive_check_result_list!=NULL && ((t - last_checked) > 5) ) {
+                        process_passive_service_checks();
+                        last_checked = t;
+                }
+        }
+
 
 
 #ifdef DEBUG0
--- commands.c.old	Sat Aug  2 15:37:31 2003
+++ nagios-1.1/base/commands.c	Sat Aug  2 15:44:36 2003
@@ -63,7 +63,7 @@
 
 extern FILE     *command_file_fp;
 
-passive_check_result    *passive_check_result_list;
+passive_check_result    *passive_check_result_list = NULL;
 
 int             flush_pending_commands=FALSE;
 
@@ -96,9 +96,6 @@
 	/* update the status log with new program information */
 	update_program_status(FALSE);
 
-	/* reset passive check result list pointer */
-	passive_check_result_list=NULL;
-
 	/* reset flush flag */
 	flush_pending_commands=FALSE;
 
@@ -1279,13 +1276,9 @@
 
 	new_pcr->next=NULL;
 
-	/* add the passive check result to the end of the list in memory */
-	if(passive_check_result_list==NULL)
-		passive_check_result_list=new_pcr;
-	else{
-		for(temp_pcr=passive_check_result_list;temp_pcr->next!=NULL;temp_pcr=temp_pcr->next);
-		temp_pcr->next=new_pcr;
-	        }
+	/* add the passive check result to the head of the list in memory */
+	new_pcr->next = passive_check_result_list;
+	passive_check_result_list = new_pcr;
 
 #ifdef DEBUG0
 	printf("cmd_process_service_check_result() end\n");
@@ -2845,6 +2838,17 @@
 
 		/* the grandchild process should submit the service check result... */
 		if(pid==0){
+			passive_check_result *t, *t1;
+			/* Reverse passive list to correct the reverse that happened in collection. */
+
+			t = passive_check_result_list;
+			passive_check_result_list = NULL;
+			while (t) {
+				t1 = t->next;
+				t->next = passive_check_result_list;
+				passive_check_result_list = t;
+				t = t1;
+			}
 
 			/* write all service checks to the IPC pipe for later processing by the grandparent */
 			for(temp_pcr=passive_check_result_list;temp_pcr!=NULL;temp_pcr=temp_pcr->next){
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 189 bytes
Desc: not available
URL: <https://www.monitoring-lists.org/archive/users/attachments/20030806/4bc07a39/attachment.sig>


More information about the Users mailing list