#!/etc/Tivoli/bin/perl
# tivstat.pl
# Script to watch for critical tivoli processes dying.
#
# Notes:
# 1) Cache keys are forced to lowercase
#    status values of cache hash:
#      no key - this process has never been detected on this endpoint
#      1      - this process is running
#      0      - this process was running but has stopped
# 2) Includes work done by James Boone (see tme10 email forum "postemsg.pl")
# 3) Intended to be run as an lcfd boot_method with its own dependency set:
#    on NT,
#      - ntprocinfo.exe (generates a process list on NT - Tivoli supplied)
#      - perl.exe (V4 Tivoli supplied)
#      - tivstat.tecs (data file that has to be created)
#    on UNIX,
#      - perl (V4 Tivoli supplied)
#      - tivstat.tecs (data file that has to be created)
# 4) tivstat.tecs holds one line of comma separated "host:flag" entries,
#    where a true flag means the TEC reception port must be looked up via
#    the portmapper and a false flag means the fixed port 5529 is used.
# 5) The code deliberately sticks to Perl 4 constructs (local, &sub(),
#    two argument open) because that is the interpreter it is shipped with.
#
# V1.0, Original, Paul Claridge, 06Jun2001
# V1.1 Paul Claridge, 12Jun2001
#      Added more logic to correctly detect oserv restart
#      Tightened up string matches to avoid bad process matching
# V1.2 Paul Claridge, 13Jun2001
#      Further tightening of string matching to avoid wrong alerts in the
#      case where a process is a substring of another process

# You probably need to review all of these and the %tiv_sev hash!
$ENV{'UNIX95'}=1;                                # improves portability for ps on UNIXs
$want_procs=1;                                   # 1 - check processes, 0 - don't
$want_hb=1;                                      # 1 - generate heartbeat, 0 - don't
$proc_class='Tivoli_process;source=ALL;';        # class and source for process events
$heartbeat_class='Tivoli_heartbeat;source=ALL;'; # class and source for heartbeat events
$proc_poll=$next_procs=20;                       # interval in seconds for looking at the process list
$hb=$next_hb=60;                                 # interval in seconds for sending a heartbeat
$do_procs=$want_procs;                           # set flag for first time through
$do_hb=$want_hb;                                 # set flag for first time through

# associate a TEC severity with each process.
%tiv_sev=(
    'tmf_sched',       'FATAL',
    'ep_mgr',          'FATAL',
    'rptm',            'FATAL',
    'oserv',           'FATAL',
    'lcfd',            'MINOR',
    'gateway',         'FATAL',
    'rim_oracle_prog', 'FATAL',
    'trip',            'MINOR',
    'spider',          'MINOR',
    'rcserv',          'MINOR',
    'ntfserver',       'CRITICAL',
    'sentry_engine',   'CRITICAL',
    'sentry_gateway',  'CRITICAL',
    'tec_server',      'CRITICAL',
    'tec_gateway',     'CRITICAL',
    'oracle73',        'CRITICAL',
    'tnslsnr',         'CRITICAL',
    'tecadnts',        'CRITICAL',
    'tecad_snmps',     'CRITICAL',
    'dm_ep_engine',    'CRITICAL'
);

# make list of tivoli procs of interest
@tivprocs=keys %tiv_sev;

# associate command to generate proc list with interp
%proc_gen=(
    'w32-ix86', 'ntprocinfo',
    'aix4-r1',  'ps -eo pid,args',
    'solaris2', 'ps -eo pid,args'
);
$proc_cmd=$proc_gen{$ENV{'INTERP'}};

# Numeric socket types are passed straight to socket(); Solaris defines
# SOCK_STREAM as 2 and SOCK_DGRAM as 1, the reverse of the other interps.
if ($ENV{'INTERP'} =~ /^solaris2/) {
    $sock_stream=2;
    $sock_dgram=1;
}
else {
    $sock_stream=1;
    $sock_dgram=2;
}

# determine files location
$dir=$ENV{'LCF_DATDIR'} || $ENV{'PWD'} || '.';
$cache_file="$dir/tivstat.cache";
$log="$dir/tivstat.log";
$tec_list="$dir/tivstat.tecs";
$pid="$dir/tivstat.pid";

&init();    # check that we are not running already, open files etc

# determine endpoint name
$h=`hostname`;
$h=~s/\s+$//;
$me=&get_ep_name() || $h || $ENV{'COMPUTERNAME'} || $ENV{'HOST'};

# set up signal handlers (NT only seems to catch Cntrl-C).
# KILL can never actually be caught; TERM is what a plain kill sends.
for $s ('HUP','INT','QUIT','TERM','KILL') {
    $SIG{$s}='signals';
}

# setup some static event information
$origin=join('.',unpack('C4',(gethostbyname($me))[4]));
$static="hostname=$me;origin=$origin;";

# loop continuously with variable sleeps
while (1) {
    $loop_start=time;    # capture loop start
    $pmsg='';            # reset packed message of event(s)

    &check_procs() if $do_procs && $want_procs;
    &create_heartbeat() if $do_hb && $want_hb;

    if ($pmsg) {
        # send any events in one go, try tecs in order
        $retried=0;      # never carry a retry flag over from an earlier pass
        for $t (@tec_servers) {
            # LIMIT 3 keeps the packed address whole even if it holds a ':' byte
            ($techost,$pm_flag,$tec_recv)=split(/:/,$t,3);
            last if &send_events($tec_recv);
            next unless $pm_flag;
            # if this tec fails and is portmapped retry once in case of restart
            if ($retried) {
                $retried=0;
                next;
            }
            # $t aliases the array element, so the refreshed address is kept.
            # Plain assignment is used because the packed bytes may contain
            # a newline, which a /^(.*)$/ substitution would not match.
            $t=join(':',$techost,$pm_flag,
                    pack('S n a4 x8',2,&get_port($techost),$tec_addrs{$techost}));
            $retried=1;
            redo;        # try that one again
        }
    }

    print CACHE join(',',%cache);
    seek(CACHE,0,0);     # just keep latest values (1 line)

    if ($nap=&calc_wait(time)) {
        sleep $nap;
    }
}

#### subroutines ####

# Write one timestamped line to the log file.
#   $_[0] - message text (without trailing newline)
sub log_msg {
    print LOG &now()." # $_[0]\n";
}

# One-off start up work: open the log, refuse to run twice, load the saved
# cache, open the cache file for writing and read the list of TEC servers.
# Exits the script if there is nothing to do, if no process listing command
# is known for this interp, or if another tivstat is found running.
sub init {
    local($old,$line,$tecs,$host,$flag,$port);

    # open log file for errors etc
    open(LOG,">>$log") || warn "Could not open log file <$log>\n";
    $old=select(LOG); $|=1; select($old);

    unless ($want_procs || $want_hb) {
        &log_msg("Neither \$want_procs nor \$want_hb flags have been set, exiting..");
        exit;
    }

    # check we have command for this interp
    if ($proc_cmd) {
        &get_procs();    # also fills @p for the duplicate check below
    }
    else {
        &log_msg("Do not know process command to run for interp <$ENV{'INTERP'}>");
        exit;
    }

    # check for already running tivstat and exit if found
    if (open(PID,"$pid")) {
        $last_pid=<PID>;
        close PID;
        $last_pid=~s/\s+//g;
        exit 0 if $last_pid ne '' && grep(/"$last_pid:perl"/,@p);
    }

    # write new pid file
    if (open(PID,">$pid")) {
        print PID $$;
        close PID;
    }
    else {
        &log_msg("Could not write pid file <$pid>");
    }
    &log_msg("--- started, pid <$$>");

    # read disk file if it exists - saved cache across restarts/reboots
    %cache=();
    if (open(IN,"$cache_file")) {
        $line=<IN>;
        close IN;
        %cache=split(/,/,$line);
    }

    # open new cache file for writing latest cache
    if (open(CACHE,">$cache_file")) {
        $old=select(CACHE); $|=1; select($old);
    }
    else {
        &log_msg("Could not open cache file <$cache_file>");
    }

    # read tivstat.tecs file for tecserver(s)
    if (open(TL,"$tec_list")) {
        $tecs=<TL>;
        close TL;
        $tecs=~s/\s+$//;    # safe whether or not the line ends in a newline
        @tec_servers=split(/\s*,\s*/,$tecs);
        for $t (@tec_servers) {
            ($host,$flag)=split(/:/,$t);
            $tec_addrs{$host}=(gethostbyname($host))[4];
            &log_msg("Could not resolve TEC server <$host>")
                unless defined $tec_addrs{$host};
            $port=$flag ? &get_port($host) : 5529;
            # entry becomes host:flag:packed-socket-address
            $t=join(':',$host,($flag ? 1 : 0),
                    pack('S n a4 x8',2,$port,$tec_addrs{$host}));
        }
    }
    else {
        @tec_servers=();
        &log_msg("No TEC servers defined in <$tec_list>, unable to send events!");
    }
}

# Run the process listing command and sort the watched processes into
# @running and @not_running (lowercased names). @p keeps the reduced
# listing, one '"pid:command"' entry per output line.
sub get_procs {
    local($proc);

    # reset for each run
    @running=();
    @not_running=();

    # make list of running processes
    @p=grep(s/^\s*(\d*)\s*(\S*).*$/"$1:$2"/,`$proc_cmd`);

    for $proc (@tivprocs) {
        if (grep(/\b$proc\b/i,@p)) {
            push(@running,"\L$proc");
        }
        else {
            push(@not_running,"\L$proc");
        }
    }
}

# Compare the current process list with the cache and queue an event for
# every watched process that has stopped or come back.
sub check_procs {
    &get_procs();

    # catch oserv restart
    if (grep(/^oserv$/,@running) && defined $cache{'oserv'} && $cache{'oserv'}==0) {
        &create_proc_event('oserv','restarted','HARMLESS');
        $cache{'oserv'}++;
        # now check children have restarted
        while ($r=shift @o_children) {
            if (grep(/^$r$/,@running)) {
                $cache{$r}++;
            }
            else {
                &create_proc_event($r,'not restarted with oserv',$tiv_sev{$r});
            }
        }
    }

    for $r (@running) {
        next if $cache{$r};
        # a key that is already cached means the process was seen before
        &create_proc_event($r,'restarted','HARMLESS') if grep(/^$r$/,keys %cache);
        $cache{$r}++;
    }

    # catch oserv failure to send only one event
    if (grep(/^oserv$/,@not_running) && $cache{'oserv'}) {
        $o_stop=1;
    }
    else {
        $o_stop=0;
    }

    for $n (@not_running) {
        next unless $cache{$n};    # only report processes that were running
        $cache{$n}--;
        if ($n=~/^oserv$/ || !$o_stop) {
            &create_proc_event($n,'stopped',$tiv_sev{$n});
        }
        if ($n!~/^oserv$/ && $o_stop) {
            # stopped together with oserv: remember it for the restart check
            push(@o_children,$n);
        }
    }
}

# Work out how long to sleep and which jobs are due on the next pass.
#   $_[0] - current time
# Updates $next_procs/$next_hb and sets $do_procs/$do_hb.
# Returns the number of seconds to sleep (0 when something is overdue).
sub calc_wait {
    $next_procs-=($_[0]-$loop_start);
    $next_hb-=($_[0]-$loop_start);

    if ($next_procs <= 0 || $next_hb <= 0) {
        if ($next_procs <= 0) {
            $do_procs=1;
            $next_procs+=$proc_poll;
        }
        else {
            $do_procs=0;
        }
        if ($next_hb <= 0) {
            $do_hb=1;
            $next_hb+=$hb;
        }
        else {
            $do_hb=0;
        }
        return 0;
    }
    elsif ($next_procs > $next_hb) {
        # heartbeat is due first
        $wait=$next_hb;
        $do_hb=1;
        $do_procs=0;
        $next_procs-=$next_hb;
        $next_hb=$hb;
    }
    elsif ($next_procs < $next_hb) {
        # process check is due first
        $wait=$next_procs;
        $do_procs=1;
        $do_hb=0;
        $next_hb-=$next_procs;
        $next_procs=$proc_poll;
    }
    else {
        # both are due together
        $wait=$next_procs;
        $do_procs=1;
        $do_hb=1;
        $next_procs=$proc_poll;
        $next_hb=$hb;
    }
    return $wait;
}

# Wrap one event string in the binary envelope that is sent to the TEC.
#   $_[0] - event text
# Returns the packed record.
sub pack_event {
    local($msg)=@_;
    local($msg_l)=length($msg);
    local($term_l)=$msg_l+2;    # text terminated by newline and Cntrl-A (0x01)

    # '<START>>' fills the 8 byte a8 header field
    return pack("a8 N7 a$msg_l c2",'<START>>',0,0,0,0,0,$term_l,$term_l,$msg,0x0a,0x01);
}

# Append a heartbeat event to the pending message buffer $pmsg.
sub create_heartbeat {
    local($date)="date=".&now();
    local($msg)=$heartbeat_class.$static."$date;severity=HARMLESS;".'END';

    $pmsg.=&pack_event($msg);
}

# Log a process event and append it to the pending message buffer $pmsg.
#   $_[0] - process name
#   $_[1] - what happened, e.g. 'stopped' or 'restarted'
#   $_[2] - TEC severity (sent in upper case)
sub create_proc_event {
    local($proc,$mode,$sev)=@_;
    local($date)="date=".&now();
    local($msg)=$proc_class.$static."$date;msg=\"<$proc> has $mode.\";severity=\U$sev\E;".'END';

    &log_msg("<$msg>");
    $pmsg.=&pack_event($msg);
}

# Return the local time as dd/mm/yyyy-hh:mm.
sub now {
    local($sec,$min,$hour,$mday,$mon,$year)=localtime(time);

    return sprintf("%02d/%02d/%02d-%02d:%02d",$mday,$mon+1,$year+1900,$hour,$min);
}

# Send the pending events in $pmsg over TCP.
#   $_[0] - packed socket address of the TEC reception port
# Returns a true value when the events were sent. Reads $techost, $pm_flag
# and $retried (set by the main loop) to word and filter the log entries.
sub send_events {
    local($dest)=@_;
    local($old);

    $rc=0;
    if (socket(S,2,$sock_stream,6) && connect(S,$dest)) {
        $old=select(S); $|=1; select($old);
        $rc=send(S,$pmsg,0);
        &log_msg("ERROR sending event(s) to <$techost>") unless $rc;
    }
    else {
        # a portmapped tec only gets logged once its single retry has failed
        &log_msg("socket connect error to <$techost>") if !$pm_flag || $retried;
    }
    close S;    # also releases the socket when only connect() failed
    return $rc;
}

# send udp packet to portmapper to get tec recv port
#   $_[0] - TEC host name (key into %tec_addrs)
# Returns the port taken from the reply, or -1 if the request could not be
# sent.
# NOTE(review): recv() blocks until a reply arrives; a lost datagram will
# stall the script.
sub get_port {
    local($host)=@_;
    # RPC call to program 100000 (0x0186a0) version 2 procedure 3, asking
    # for program 100033057 version 1 over protocol 6 (TCP) - presumably
    # the TEC reception program number, confirm against the TEC setup.
    local($dg)=pack('N13 x2',12345,0,2,0x0186a0,2,3,0,0,0,0,100033057,1,6);
    local($tec_pm)=pack('S n a4 x8',2,111,$tec_addrs{$host});
    local($reply)='';

    socket(PM,2,$sock_dgram,17) || return -1;
    unless (send(PM,$dg,0,$tec_pm)) {
        close PM;
        return -1;
    }
    recv(PM,$reply,256,0);
    close PM;
    return (unpack('N7',$reply))[6];
}

# Look up the endpoint name in $LCF_DATDIR/last.cfg.
# Returns the value of the first lcs.machine_name= line, or nothing when
# the file cannot be opened or has no such line.
sub get_ep_name {
    local($ep_name);
    local($_);

    if (open(CFG,"$ENV{'LCF_DATDIR'}/last.cfg")) {
        while (<CFG>) {
            s/\r?\n$//;
            if (/^lcs\.machine_name=(.*)$/) {
                $ep_name=$1;
                last;
            }
        }
        close CFG;
    }
    return $ep_name;
}

# Signal handler: note the signal in the log, close the files and exit.
#   $_[0] - signal name
sub signals {
    &log_msg("Caught signal <$_[0]>, exiting..");
    close CACHE;
    close LOG;
    exit;
}