#!/usr/bin/perl -w
#
# check_esx
# Check the status of a virtual machine on a VMware ESX server, via SNMP
# Return status in format for either Nagios or MRTG
#
# Steve Shipway (www.steveshipway.org) Nov 2004
# Released under GNU GPL
#
# Usage:
# check_esx -C community -H hostname -v virtualhost [ -N | -M ] [-l thing [-w warn -c crit]]
# -N nagios mode (need -w and -c), -M MRTG mode
# -l thing can be CPU, MEM, STATE or LIST (LIST needs no -v)
# default community is public
# default mode is Nagios

use Net::SNMP;
use Getopt::Std;

# Base OID under which the VM, CPU and memory tables are read.
my $VMOID = "1.3.6.1.4.1.6876";

# Exit codes handed to Nagios by dooutput.
my $OK       = 0;
my $WARNING  = 1;
my $CRITICAL = 2;
my $UNKNOWN  = 3;

my $DEBUG   = 0;	# set by -d; enables the "(...)" trace lines
my $TIMEOUT = 15;	# passed as -timeout to Net::SNMP; overridden by -t
my($from,$to) = (0,99999);

# SNMP session, last response and last session error, shared by all subs.
my $snmp;
my $resp;
my $snmperr;

my $hostname  = '';		# ESX server, from -H
my $community = 'public';	# SNMP community, from -C
my $vhost     = '';		# virtual machine name, from -v

# Result values: $A and $B are the two figures printed in MRTG mode,
# $MSG is the text line printed in either mode.
my $A   = 'U';
my $B   = 'U';
my $MSG = '';

my $STATUS = $UNKNOWN;	# exit status used in Nagios mode
my $MODE   = 0;		# 0 = Nagios output, 1 = MRTG output (-M)
my $VMID   = -1; # set to -1 if not running
my $VMNO   = -1; # set to -1 if not defined
my $warn   = 0;	# -w threshold
my $crit   = 0;	# -c threshold

use vars qw($opt_C $opt_H $opt_N $opt_M $opt_h $opt_c $opt_t $opt_d $opt_w $opt_l $opt_v);

# dohelp: print the command-line usage summary on stdout and exit.
# Takes no arguments and never returns; the exit status is always 0.
# The text lists every option accepted by getopts, including -v (needed
# for CPU, MEM and STATE) and -t (SNMP timeout).
sub dohelp {
	print "Usage: check_esx [-d][-h] -H host [-C community][-t timeout][-N | -M]\n                [-v virtualhost][-l thing [-c crit -w warn]]\n";
	print "-l can be CPU(not for Nagios) MEM STATE(1/0 in MRTG) LIST\n";
	print "-v is required for CPU, MEM and STATE; LIST reports every VM on the host\n";
	print "Thresholds are for MEM or LIST under Nagios\n";
	print "MEM thresholds can be in K, M or %\n";
	print "eg: -w 1024K       -w 80%\n";
	print "LIST thresholds are a count of VMs that are not on\n";
	print "    MEM is memory remaining; CPU is total CPU seconds.\n";
	exit 0;
}

# dooutput: print the collected result and terminate the script.
# Reads the file-level $MODE, $A, $B, $MSG and $STATUS; never returns.
#   Nagios mode ($MODE false): prints $MSG and exits with $STATUS.
#   MRTG mode   ($MODE true):  prints $A, $B, a blank line and $MSG, then
#                              exits 0.
sub dooutput {
	unless( $MODE ) {
		# Nagios: one line of text, status carried by the exit code
		print "$MSG\n";
		exit $STATUS;
	}

	# MRTG: fall back to $A for a missing second value and to a generic
	# description when no message has been set
	$B = $A unless( defined $B );
	$MSG = "Returned values: $A, $B\n" unless( $MSG );
	print "$A\n$B\n\n$MSG\n";
	exit 0;
}

# getvmid: open the SNMP session to $hostname and resolve $vhost.
# Walks the table at $VMOID.2.1.1 and sets the file-level variables:
#   $VMNO - table index of the VM whose column-2 value equals $vhost
#   $VMID - column-7 value for that index, when it is usable
#   $MSG  - explanation when the VM is not defined or not running
# $VMNO / $VMID are left untouched (initially -1) when nothing is found.
# On a session or table-retrieval failure the error is reported through
# dooutput(), which prints and exits, so this sub does not return then.
sub getvmid {
	# Two separate maps: a single shared hash would let a VM whose
	# display name is a number collide with a table index.
	my(%vmno_for) = ();	# column 2 value (name) -> table index
	my(%vmid_for) = ();	# table index -> column 7 value (VM ID)

	print "(snmp lookup)\n" if($DEBUG);

	($snmp,$snmperr) = Net::SNMP->session( -hostname=>$hostname,
		-community=>$community, -timeout=>$TIMEOUT );
	if( !defined $snmp or $snmperr ) {
		print "($snmperr)\n" if($DEBUG);
		$MSG = "Error: $snmperr";
		dooutput(); # prints and exits
	}
	$resp = $snmp->get_table( -baseoid=>"$VMOID.2.1.1");
	if( !$resp ) {
		# Report the SNMP failure itself instead of falling through to
		# a misleading "not defined" message.
		$MSG = "Unable to retrieve VM table for $hostname: ".$snmp->error;
		$STATUS = $UNKNOWN;
		dooutput(); # prints and exits
	}
	foreach my $oid ( keys %$resp ) {
		# last two OID components are <column>.<table index>; skip
		# anything that does not have that shape
		next unless( $oid =~ /(\d+)\.(\d+)$/ );
		my($col,$idx) = ($1,$2);
		if( $col == 2 ) {
			$vmno_for{$resp->{$oid}} = $idx;
		} elsif( $col == 7 ) {
			$vmid_for{$idx} = $resp->{$oid};
		}
	}
	if(defined $vmno_for{$vhost}) {
		$VMNO = $vmno_for{$vhost};
		my $id = $vmid_for{$VMNO};
		# A missing ID means not running. A negative ID is treated the
		# same way, matching the "-1 if not running" convention used
		# for $VMID in this file.
		# NOTE(review): presumably the agent reports -1 for a powered
		# off VM - confirm against the VMware MIB.
		if( defined $id and $id !~ /^\s*-/ ) {
			$VMID = $id;
		} else {
			$MSG = "Virtual host $vhost($VMNO) is not running!";
		}
	} else {
		$MSG = "Virtual host $vhost is not defined!";
	}

	print "(hostno=$VMNO, ID=$VMID)\n" if($DEBUG);
}

# listvm: open the SNMP session to $hostname and summarise every VM in
# the table at $VMOID.2.1.1.
# Sets the file-level variables:
#   $B      - number of VMs found (column 2 entries)
#   $A      - number of those whose column 6 value is "on"
#   $MSG    - "VHosts: A/B on: name, name(on), ..." or an error text
#   $STATUS - $OK on success or when the table is empty, $UNKNOWN when the
#             table cannot be retrieved
# A session failure is reported through dooutput(), which prints and
# exits, so this sub does not return in that case.
sub listvm {
	# Two separate maps: a single shared hash would let a VM whose
	# display name is a number collide with a table index.
	my(%vmno_for,%state_for,@vh,$val);
	%vmno_for = (); %state_for = (); @vh = ();
	$A = $B = 0;
	print "(snmp lookup)\n" if($DEBUG);

	($snmp,$snmperr) = Net::SNMP->session( -hostname=>$hostname,
		-community=>$community, -timeout=>$TIMEOUT );
	if( !defined $snmp or $snmperr ) {
		print "($snmperr)\n" if($DEBUG);
		$MSG = "Error: $snmperr";
		dooutput(); # prints and exits
	}
	$resp = $snmp->get_table( -baseoid=>"$VMOID.2.1.1");
	if ( ! $resp ) {
		# Match the message loosely: the wording is not identical in
		# every Net::SNMP release (some start it with "The "), so an
		# exact string comparison misses the empty-table case.
		if ( $snmp->error
			=~ /requested table is empty or does not exist/i ) {
			$MSG = "No VMs configured on $hostname";
			$STATUS = $OK;
			return;
		}
		$MSG = "Unable to retrieve VM table for $hostname: "
			. $snmp->error;
		$STATUS = $UNKNOWN;
		return;
	}
	foreach my $oid ( keys %$resp ) {
		print "(checking snmp oid $oid)\n" if($DEBUG);
		# last two OID components are <column>.<table index>; skip
		# anything that does not have that shape
		next unless( $oid =~ /(\d+)\.(\d+)$/ );
		my($col,$idx) = ($1,$2);
		if( $col == 2 ) {
			print "(matches 2 in $oid)\n" if($DEBUG);
			$vmno_for{$resp->{$oid}} = $idx;
			push @vh, $resp->{$oid};
		} elsif( $col == 6 ) {
			print "(matches 6 in $oid)\n" if($DEBUG);
			$state_for{$idx} = $resp->{$oid};
		} else {
			print "(non-matches $col in $oid)\n" if($DEBUG);
		}
	}
	foreach ( @vh ) {
		$B++;	# count of machines
		$val = $state_for{$vmno_for{$_}};
		if ( defined( $val ) && $val eq "on" ) {
			$_ .= "($val)";	# $_ aliases the @vh element: tag it in place
			$A++;	# count of on machines
		}
	}
	# sorted so the message does not change order between runs
	$MSG = "VHosts: $A/$B on: ".(join ", ",sort @vh);
	$STATUS = $OK;
}

# readcpu: fetch the CPU counter for the VM identified by $VMID.
# Uses the already-open $snmp session. On success stores the value read
# from $VMOID.3.1.2.1.3.<VMID> in $A and sets $B to 0; on failure sets
# $MSG to the SNMP error text and $STATUS to $UNKNOWN.
sub readcpu {
	my $oid = $VMOID . ".3.1.2.1.3." . $VMID;
	print "(retrieving $oid)\n" if( $DEBUG );

	$resp = $snmp->get_request( -varbindlist => [ $oid ] );
	unless( $resp ) {
		$MSG = "Unable to retrieve CPU statistics for $vhost: ".$snmp->error;
		$STATUS = $UNKNOWN;
		return;
	}

	($A, $B) = ($resp->{$oid}, 0);
}
# readmem: fetch the memory figures for the VM identified by $VMID.
# Uses the already-open $snmp session. On success:
#   $B = value at $VMOID.3.2.4.1.3.<VMID> multiplied by 1024 (total)
#   $A = $B minus the value at $VMOID.3.2.4.1.4.<VMID> (memory remaining)
# On failure sets $MSG to the SNMP error text and $STATUS to $UNKNOWN.
# NOTE(review): presumably column 3 is configured memory in MB and
# column 4 is memory in use in KB - confirm against the VMware MIB.
sub readmem {
	my $oid_total = $VMOID . ".3.2.4.1.3." . $VMID;
	my $oid_used  = $VMOID . ".3.2.4.1.4." . $VMID;
	print "(retrieving $oid_total,$oid_used)\n" if( $DEBUG );

	$resp = $snmp->get_request( -varbindlist => [ $oid_total, $oid_used ] );
	unless( $resp ) {
		$MSG = "Unable to retrieve memory statistics for $vhost: ".$snmp->error;
		$STATUS = $UNKNOWN;
		return;
	}

	$B = $resp->{$oid_total} * 1024;
	$A = $B - $resp->{$oid_used}; # memory remaining
}

###########################################################################
# Parse the command line into the file-level settings, then validate it.
# Every validation failure sets $MSG and leaves through dooutput(), which
# prints and exits; the "exit 0" lines after it are only a safety net.
# The LIST command is handled completely in this section because it needs
# no virtual host name.
getopts('hH:c:t:dv:w:NMC:l:');
$hostname = $opt_H if($opt_H);
$vhost = $opt_v if($opt_v);
$warn = $opt_w if($opt_w);
$crit = $opt_c if($opt_c);
$TIMEOUT = $opt_t if($opt_t);
$MODE = 1 if($opt_M);
$community = $opt_C if($opt_C);
$DEBUG=1 if($opt_d);
dohelp() if($opt_h);

if(!$hostname) {
	$MSG = "No ESX server hostname specified!";
	dooutput();
	exit 0;
}
# $opt_l is undefined when -l was not given; test that first so the match
# does not operate on an undefined value (the "no command" case is
# reported further down).
if( defined $opt_l and $opt_l =~ /LIST/i ) {
	if ( ! $opt_t ) {
		# no user-specified timeout, so
		# increase the timeout while we're getting the table
		$TIMEOUT *= 2;
	}
	listvm();
	# After listvm: $A = machines that are on, $B = machines defined.
	# Thresholds here are counts of machines that are not on.
	if ( $A < $B ) {	# not all machines are up
		if ( $warn == 0 || ($warn>0 && ($B-$A)>=$warn) ) {
			$STATUS = $WARNING;
		}
		# if nothing is up, that's critical, unless we set $crit
		if ( ( $crit == 0 && $A==0 ) || ($crit>0 && ($B-$A)>=$crit) ) {
			$STATUS = $CRITICAL;
		}
	}
	dooutput();
	exit 0;
}
if(!$vhost) {
	$MSG = "No virtual hostname specified!";
	dooutput();
	exit 0;
}
if( !$opt_l  ) {
	$MSG = "You need to specify a command!";
	dooutput();
	exit 0;
}
if( $opt_l !~ /CPU|MEM|STAT/i ) {
	$MSG = "Bad command $opt_l!";
	dooutput();
	exit 0;
}
# The command is matched case-insensitively everywhere else, so these two
# guards must be as well; otherwise "-l cpu" or "-l mem" would slip past
# them and reach the threshold code with unusable settings.
if( $opt_l =~ /CPU/i and !$MODE ) {
	$MSG = "Cannot check CPU in Nagios mode"; 
	dooutput();
	exit 0;
}
if( $opt_l =~ /MEM/i and !$MODE and (!$crit or !$warn)) {
	$MSG = "Invalid warn/critical thresholds for Memory Used"; 
	dooutput();
	exit 0;
}


# Now, we have host, vhost, community, and command.
# Resolve the VM, run the requested check (STATE, CPU or MEM), apply the
# Nagios thresholds where relevant and report through dooutput(), which
# prints and exits.
getvmid(); # also opens SNMP object
if( $opt_l =~ /STAT/i ) {
	if( $VMID < 0 ) {
		$STATUS = $CRITICAL; ($A,$B) = (0,0);
		# never report a down VM with an empty message
		$MSG = "Virtual host $vhost is not running!" if(!$MSG);
	} else {
		$STATUS = $OK; ($A,$B) = (1,1);
		$MSG = "VHost $vhost is up (ID: $VMID)";
	}
	dooutput();
	exit 0;
}
if( $VMID < 0 ) {
	# CPU and MEM need a running VM; $MSG normally comes from getvmid
	$STATUS = $CRITICAL;
	$MSG = "Virtual host $vhost is not running!" if(!$MSG);
	dooutput();
	exit 0;
}

$STATUS = $OK;
if( $opt_l =~ /CPU/i ) {	
	$MSG = "";
	readcpu();
	$MSG = "CPU has used $A seconds" if(!$MSG);
} else {
	my($pc,$tot,$av,$sfx);
	$MSG = "";
	readmem();
	if(!$MSG) {
		# $A = memory remaining, $B = total, as set by readmem.
		# Percentage to two decimals; a reported total of 0 would
		# otherwise abort the script with a division by zero.
		$pc = $B ? int($A/$B*10000.0)/100.0 : 0;
		# values above 2047 are shown divided by 1024 with the Mb suffix
		$sfx = "Kb"; $av = $A;
		if($av>2047) { $av = int($av/10.24)/100.0; $sfx="Mb"; }
		$av .= $sfx;
		$sfx = "Kb"; $tot = $B;
		if($tot>2047) { $tot = int($tot/10.24)/100.0; $sfx="Mb"; }
		$tot .= $sfx;
		$MSG = "Memory free: $av ($pc%) [Total available $tot]" ;
	}
}

if( !$MODE and $STATUS==$OK ) {
	# Set Nagios thresholds: convert "n%" (of total $B), "nM" and "nK"
	# into the same unit as $A, then compare against memory remaining.
	if( $warn =~ /([\d\.]+)%/ ) { $warn = $B * $1 / 100.0; }
	if( $crit =~ /([\d\.]+)%/ ) { $crit = $B * $1 / 100.0; }
	if( $warn =~ /([\d\.]+)M/i ) { $warn = $1 * 1024; }
	if( $crit =~ /([\d\.]+)M/i ) { $crit = $1 * 1024; }
	if( $warn =~ /([\d\.]+)K/i ) { $warn = $1; }
	if( $crit =~ /([\d\.]+)K/i ) { $crit = $1; }
	$STATUS = $WARNING  if( $A <= $warn );
	$STATUS = $CRITICAL if( $A <= $crit );
}

$snmp->close;
dooutput();
exit 0;
