check_proc --metric=CPU - alternatives?

Kyle O'Donnell nagios at isprime.org
Fri Aug 13 13:01:56 CEST 2010


I wrote one.  I can't find it online anymore, but here it is:

It tries to use top on Linux. On Solaris and AIX I have a new version of
top installed into my Nagios tree; otherwise it falls back on prstat or ps.

feel free to use, might be buggy though!


On Fri, 13 Aug 2010 12:11:37 +0200, Sebastian Ries
<Sebastian.Ries at dtnet.de> wrote:
> Hi
> 
>> According to the documentation, the -n option is used to give a regex
>> which selects which process(es) you are monitoring and the -u option
>> should work on the sum of the CPU for those selected processes only.
>> 
>> The example given is:
>> 
>> ./check_snmp_process.pl -H 127.0.0.1 -C public -n http -w 3,8 -c 0,15
>> -m 9,25 -u 70,99
> 
> This is what I tried but as I described all processes have the same
> name :-(
> 
>> which should alert if the total cpu of all processes having 'http' in
>> the name is > 70.
> 
> And I want to get an alert if ANY of these processes uses more than the
> given value.
> 
>> To get the plugin to do exactly what you describe, you will need to
>> edit the perl code a bit.  Bear in mind that you can never have more
>> than one process using >90% cpu in the same period!
> 
> Not really true ;-)
> As this machine has 8 real CPU-Cores with HT enabled there can be up to
> 16 Processes with more than 90% CPU usage ;-)
> 
> I will have a look at the Perl code. Maybe it is only a small
> adaptation...
> 
> Regards
> Sebastian Ries
-------------- next part --------------
#!/opt/nagios/bin/perl
#

use strict;
use Getopt::Long;
use POSIX "uname";
use vars qw($PROGNAME);
use lib "/opt/nagios/libexec";
use utils qw (%ERRORS &print_revision &support &usage);

# Forward declarations: both helpers are defined further down but are called
# from the top-level option handling.
sub print_help ();
sub print_usage ();

# File-scoped state shared by every subroutine in this script.

# Command-line options, plus the program version and the default command
# type chosen for the detected platform.
my ($opt_c, $opt_w, $opt_a, $opt_m, $opt_p, $opt_h, $opt_V, $PROGVER, $opt_t, $deftype);

# Columns parsed from one line of top/ps/prstat output.
my ($line, $pid, $user, $prior, $nice, $vsz, $rss, $shr, $state, $cpu, $mem, $time, $cmd, @args);

# $chkcmd holds the shell command that produces the process listing.
# ($tty used to appear twice in this list; the duplicate has been removed.)
my ($chkcmd, $tty, $start, $date1, $date2, $date3, $startm, $startd);

# Aggregation results: %add maps dataset number to that dataset's sum.
my ($avg, $m, $sumavg, %add, $message, @arrcmd, $size, $pri, $lwp, $uid, $args, $wcpu);

my $loop = 0;      # number of dataset headers ("PID" lines) seen so far
my $sum = 0;       # metric total for the dataset currently being read
my $pidcount = 0;  # number of matching process lines
my $exitcode = 0;  # plugin exit status, 0 unless do_math raises it
my $opt_i = 2;     # default number of iterations (-i)
my $opt_d = 1;     # default delay in seconds between iterations (-d)

$PROGNAME="check_procs_usage";
$PROGVER="1.0";

# Identify the platform; $sysname decides which command is run and how its
# output is parsed.
my ($sysname, $nodename, $release, $version, $machine ) = uname;

# Parse the command line. GetOptions returns false when it meets an unknown
# option or a value of the wrong type; treat that as a usage error instead of
# silently running the check with whatever could be parsed.
Getopt::Long::Configure('bundling');
GetOptions(
        "V"   => \$opt_V, "version"     => \$opt_V,
        "h"   => \$opt_h, "help"        => \$opt_h,
        "m=s" => \$opt_m, "metric=s"    => \$opt_m,
        "w=f" => \$opt_w, "warn=f"      => \$opt_w,
        "c=f" => \$opt_c, "crit=f"      => \$opt_c,
        "p=s" => \$opt_p, "pidname=s"   => \$opt_p,
        "a=s" => \$opt_a, "args=s"      => \$opt_a,
        "d=f" => \$opt_d, "delay=f"     => \$opt_d,
        "t=s" => \$opt_t, "type=s"      => \$opt_t,
        "i=f" => \$opt_i, "iter=f"      => \$opt_i) or do {
	print_usage();
	exit $ERRORS{'UNKNOWN'};
};

# Choose the command ($chkcmd) that produces the process listing for this
# platform and the requested -t type. $opt_i and $opt_d are interpolated into
# the command line; both are declared with Getopt::Long's "=f" type above.
#
#   Linux : top only
#   SunOS : top (default) or prstat
#   AIX   : ps (default) or top
if ($sysname =~ /Linux/) {

	$deftype = "top";

	if (($opt_t =~ /$deftype/) || ($opt_t eq "")) {

		$chkcmd = "/usr/bin/top -b -c -n $opt_i -d $opt_d";
	}
	else {
		print "*** $sysname only supports $deftype ***\n";
	}
}
elsif ($sysname =~ /SunOS/) {

	$deftype = "top";

	if (($opt_t =~ /$deftype/) || ($opt_t eq "")) {
		$chkcmd = "/opt/nagios/bin/top -b -c -C -u -d $opt_i -s $opt_d all";
	}
	elsif ($opt_t =~ /prstat/) {
		$chkcmd = "/usr/bin/prstat -n 10000 $opt_d $opt_i | /usr/bin/tee";
	}
	else {
		# print, not printf: $opt_t is user input and must not be used as a
		# format string.
		print "*** $sysname doesn't support $opt_t ***\n";
	}

}
elsif ($sysname =~ /AIX/) {

	$deftype = "ps";

	if (($opt_t =~ /$deftype/) || ($opt_t eq "")) {
		$chkcmd = "/usr/bin/ps auxww";
	}
	elsif ($opt_t =~ /top/) {
		$chkcmd = "/opt/nagios/bin/top -b -c -C -u -d $opt_i -s $opt_d all";
	}
	else {
		print "*** $sysname does not support $opt_t ***\n";
	}
}
else {
	print "$sysname not supported\n";
}

# Without a command there is nothing to measure, so stop with UNKNOWN rather
# than running an empty command later. -h and -V are still allowed through so
# that help and version output remain available on any platform.
if (!defined($chkcmd) && !$opt_h && !$opt_V) {
	exit $ERRORS{'UNKNOWN'};
}

# -V: print the program name and version, then stop.
if ($opt_V) {
	print "$PROGNAME $PROGVER\n";
	exit $ERRORS{'UNKNOWN'};
}

# -h: print the full help text, then stop.
if ($opt_h) {
	print_help();
	exit $ERRORS{'UNKNOWN'};
}

# Required options: warn, crit, metric and process name. They are tested with
# defined() so that a threshold of 0 is accepted as a real value.
if (!defined($opt_w) || !defined($opt_c) || !defined($opt_m) || !defined($opt_p) || ($opt_p eq "")) {
	print_usage();
	exit $ERRORS{'UNKNOWN'};
}

# Reject values the rest of the script cannot work with:
#  - the metric must be exactly CPU or MEM, because the summing code compares
#    it with eq;
#  - the iteration count is used as a divisor, so it must be greater than zero;
#  - a negative delay is meaningless.
if (($opt_w !~ /[0-9]/) || ($opt_c !~ /[0-9]/) || ($opt_m !~ /\A(?:CPU|MEM)\z/)
	|| ($opt_i !~ /[0-9]/) || ($opt_d !~ /[0-9]/)
	|| ($opt_i <= 0) || ($opt_d < 0)) {

	print_help();
	print "\n***unsupported options: warn($opt_w) crit($opt_c) metric($opt_m) delay($opt_d) iterations($opt_i) ***\n\n";
	exit $ERRORS{UNKNOWN};
}

# Print the three supported invocation forms of the plugin to STDOUT,
# framed by a "usage:" heading and a trailing blank line.
sub print_usage () {
	my @synopsis = (
		"$PROGNAME -w <warn> -c <crit> -m <metric> -p <pidname> [-a pidarg] [-d delay] [-i iterations] [-t type] ",
		"$PROGNAME [-h | --help]",
		"$PROGNAME [-V | --version]",
	);

	print "\nusage:\n";

	foreach my $form (@synopsis) {
		print "  $form\n";
	}

	print "\n";
}

# Print the full help text: program name and version, author line, the usage
# synopsis, a description of every option, two examples, and a note block for
# the detected platform ($sysname) listing the command types it supports.
sub print_help () {
	print "\n$PROGNAME $PROGVER\n\n";
	print "Kyle O'Donnell (03-05-2009)\n";
	print_usage();
	print "\n";
	print "  warn\t\tpercent <metric> used resulting in warning state\n";
	print "  crit\t\tpercent <metric> used resulting in critical status\n";
	print "  metric\tCPU or MEM\n";
	print "  pidname\tname of process to search for\n";
	print "  pidarg\targument of a process to search for\n";
	print "  delay\t\tdelay in seconds between polling of data (default: 1)\n";
	print "  iter\t\tnumber of iterations of the top output (default: 2)\n";
	print "  type\t\tthe command type to use for obtaining metrics (default: see below)\n\n";
	print "example:  $PROGNAME -w 80 -c 90 -m CPU -p nscd -d 2 -i 5\n";
	print "  monitor cpu usage of the nscd process, wait 2 secs between collection of 5 data sets\n\n";
	print "example:  $PROGNAME -w 80 -c 90 -m CPU -p java -a xyz\n";
	print "  monitor cpu usage of a java process with the argument xyz\n\n";

	if ($sysname =~ /SunOS/) {

		# Only top and prstat are accepted for -t on SunOS; ps is not.
		print "*** SunOS detected ... ***\n";
		print "*** MEM is not supported! ***\n";
		print "*** types supported: top prstat (default: top) ***\n";
		print "*** prstat does not print arguments -a ignored! ***\n\n";
	}

	if ($sysname =~ /AIX/) {

		print "*** AIX detected ... ***\n";
		print "*** types supported: top* ps (default: ps) ***\n";
		print "*** man ps to determine if the way CPU% is calculated matches your requirements ***\n\n";
		print "*** experimental* unixtop does not work fully in AIX ***\n";
		print "*** top does not print command names properly ***\n";
		print "*** top does not print MEM% -m MEM ignored! ***\n";
		print "*** top does not print command arguments ***\n\n";
	}

	if ($sysname =~ /Linux/) {

		print "*** Linux detected ... ***\n";
		print "*** types supported: top (default: top) ***\n\n";
	}

}

####################################################
#
# get data by running the command and looping where appropriate
#                                                  
####################################################

# Return the final path component of a command ("/usr/sbin/nscd" -> "nscd").
# Commands that do not start with "/" are returned unchanged.
sub _command_basename {
	my ($command) = @_;

	return $command unless (defined($command) && ($command =~ /^\//));

	my @parts = split(/\/+/, $command);
	return $parts[$#parts];
}

# Fold one parsed process line into the running totals.
#
#   $raw_line   - the complete line, used to skip this plugin's own process
#   $command    - command name with any leading path already removed
#   $arg_string - the command's arguments joined with single spaces
#   $cpu_pct    - value of the CPU column
#   $mem_pct    - value of the MEM column (undef where the tool has none)
#
# A line matching -p (and -a, when given) adds the selected metric to $sum and
# increments $pidcount. Any other line stores the "not running" text in
# $message; do_math only reports that text when $sum is still undefined after
# the last dataset.
sub _tally_process {
	my ($raw_line, $command, $arg_string, $cpu_pct, $mem_pct) = @_;

	my $wanted = (($command =~ /$opt_p/) && ($raw_line !~ /$PROGNAME/));

	if ($wanted && defined($opt_a)) {
		# Match against the joined argument string. Matching the array
		# itself would test the number of arguments, not their text.
		$wanted = ($arg_string =~ /$opt_a/);
	}

	if ($wanted) {
		if ($opt_m eq "CPU") { $sum = $sum + $cpu_pct; }
		if ($opt_m eq "MEM") { $sum = $sum + $mem_pct; }

		# this counter is used to determine how many $opt_p's are running
		$pidcount = $pidcount + 1;
	}
	elsif (defined($opt_a)) {
		$message = "OK - process $opt_p with argument $opt_a is not running";
	}
	else {
		$message = "OK - process $opt_p is not running";
	}

	return;
}

# Run $chkcmd and reduce its output to the values do_math reports:
#
#   $avg      - metric total of the matching processes, averaged over datasets
#   $pidcount - matching process lines per dataset
#   $sum      - total of the last dataset; undefined if nothing matched in it
#   $message  - "not running" text, set whenever a line did not match
#
# All processes matching -p (and -a) are treated as one unit: eight httpd
# processes at 1.0% memory each are reported as httpd using 8.0%.
#
# Every line containing "PID" is taken as the header of a new dataset. The sum
# collected so far is stored in %add under the current dataset number, $sum is
# reset, and $loop is incremented.
#
# Exits with UNKNOWN when the platform is not supported, when -a is combined
# with a command that does not list arguments, or when the command returned no
# dataset at all.
sub get_data () {

	my $on_linux = ($sysname =~ /Linux/);
	my $on_aix   = (!$on_linux && ($sysname =~ /AIX/));
	my $on_sunos = (!$on_linux && !$on_aix && ($sysname =~ /SunOS/));

	if (!$on_linux && !$on_aix && !$on_sunos) {
		print "what did i tell you about $sysname!\n";
		exit $ERRORS{'UNKNOWN'};
	}

	# -a needs a listing that shows command arguments. This is a usage error,
	# so report UNKNOWN before the command is run at all.
	if (defined($opt_a)) {

		if ($on_aix && ($chkcmd !~ /ps/)) {
			print "*** -a option cannot be used with top ***\n";
			exit $ERRORS{'UNKNOWN'};
		}

		if ($on_sunos && ($chkcmd !~ /top/)) {
			print "*** -a option cannot be used with prstat, please install top ***\n";
			exit $ERRORS{'UNKNOWN'};
		}
	}

	my $via_top = ($chkcmd =~ /top/);

	my @output = `$chkcmd`;

	foreach $line (@output) {

		chomp $line;

		# start of the next dataset: store the finished sum and start over
		if ($line =~ /PID/) {

			$add{$loop} = $sum;
			undef $sum;
			$loop++;
			next;
		}

		$line =~ s/^[\s]+//g;

		if ($on_linux) {

			# linux top values
			($pid, $user, $prior, $nice, $vsz, $rss, $shr, $state, $cpu, $mem, $time, $cmd, @args) = split(/[\s]+/, $line);
		}
		elsif ($on_aix && $via_top) {

			# top 3.6.1 values
			($pid, $uid, $pri, $nice, $size, $rss, $state, $time, $wcpu, $cpu, $cmd) = split(/[\s]+/, $line);
		}
		elsif ($on_aix) {

			# ps values
			($user, $pid, $cpu, $mem, $vsz, $rss, $tty, $state, $start, $time, $cmd, @args) = split(/[\s]+/, $line);

			# a start column containing letters takes two fields, so
			# re-split with the start column as two pieces
			if ($start =~ /[a-z]/) {
				($user, $pid, $cpu, $mem, $vsz, $rss, $tty, $state, $startm, $startd, $time, $cmd, @args) = split(/[\s]+/, $line);
			}
		}
		elsif ($via_top) {

			# SunOS, top 3.6.1 values
			($pid, $uid, $lwp, $pri, $nice, $size, $rss, $state, $time, $cpu, $cmd, @args) = split(/[\s]+/, $line);
		}
		else {

			# SunOS, prstat values
			($pid, $user, $size, $rss, $state, $pri, $nice, $time, $cpu, $cmd) = split(/[\s]+/, $line);
		}

		# extract the binary name of the command running if path is present
		$cmd = _command_basename($cmd);

		# on SunOS the CPU column may carry a percent sign; drop it before summing
		if ($on_sunos && ($cpu =~ /%/)) {
			$cpu =~ s/%//;
		}

		_tally_process($line, $cmd, "@args", $cpu, $mem);

	} #foreach $line

	# store the sum of the last dataset, which has no header following it
	$add{$loop} = $sum;

	# No header line means the command failed or printed nothing usable.
	# Report that instead of dividing by zero below.
	if ($loop < 1) {
		print "UNKNOWN - no process listing returned on $sysname\n";
		exit $ERRORS{'UNKNOWN'};
	}

	# $add{0} only holds whatever preceded the first header, so skip it
	foreach $m (1 .. $loop) {
		$sumavg = $add{$m} + $sumavg;
	}

	# ps on AIX yields a single snapshot, so its total is used as is. Every
	# other command is asked for $opt_i datasets and the total is averaged
	# over them. That now includes top on AIX, which is started with the same
	# options as top on SunOS.
	if (($on_aix && !$via_top) || !($opt_i > 0)) {
		$avg = $sumavg;
	}
	else {
		$avg = $sumavg / $opt_i;
	}

	# matching lines were counted in every dataset; normalize to one dataset
	$pidcount = $pidcount / $loop;

} #get_data

####################################################
#
# this sub routine just evaluates output and exits with appropriate message/exitcode
#
####################################################

# Collect the data via get_data and turn it into the plugin result: sets
# $message to the status line and $exitcode to 0 (OK), 1 (WARNING),
# 2 (CRITICAL) or 3 (UNKNOWN).
#
# A threshold counts as exceeded only when the average is strictly greater
# than it, matching the ">" shown in the WARNING and CRITICAL messages. An
# average exactly equal to the warning threshold is therefore OK; it used to
# fall through every comparison and end up as "UNKNOWN - logic broken".
sub do_math() {

	get_data();

	# "with arg ..." is only mentioned when -a was given
	my $subject = defined($opt_a) ? "$opt_p with arg $opt_a" : "$opt_p";

	if (!defined($sum)) {

		# Nothing matched in the last dataset; get_data has normally left its
		# "OK - process ... is not running" text in $message. If there is no
		# text at all, no process line was read, so do not report OK.
		if (!defined($message)) {
			$message = "UNKNOWN - no process data collected";
			$exitcode = 3;
		}
	}
	elsif ($avg <= $opt_w) {

		# keep the familiar "<" unless the value sits exactly on the threshold
		my $relation = ($avg < $opt_w) ? "<" : "<=";

		$message = "OK - $pidcount instance(s) of $subject using $avg% $opt_m ($relation$opt_w%)";
	}
	elsif ($avg > $opt_c) {

		$message = "CRITICAL - $pidcount instance(s) of $subject using $avg% $opt_m (>$opt_c%)";
		$exitcode = 2;
	}
	else {

		# above the warning threshold but not above the critical one
		$message = "WARNING - $pidcount instance(s) of $subject using $avg% $opt_m (>$opt_w%)";
		$exitcode = 1;
	}

} #do_math

# Entry point: evaluate the check, print the resulting status line and leave
# with the exit code that do_math selected.
do_math();

print $message, "\n";
exit($exitcode);
-------------- next part --------------
------------------------------------------------------------------------------
This SF.net email is sponsored by 

Make an app they can't live without
Enter the BlackBerry Developer Challenge
http://p.sf.net/sfu/RIM-dev2dev 
-------------- next part --------------
_______________________________________________
Nagios-users mailing list
Nagios-users at lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/nagios-users
::: Please include Nagios version, plugin version (-v) and OS when reporting any issue. 
::: Messages without supporting info will risk being sent to /dev/null


More information about the Users mailing list