#!/bin/sh
#
#+-#+-#+-#-+-#+-#+-#-+-#+-#+-#-+-#+-#+-#-+-#+-#+-#-+-#+-#+-#-+-#+-#+-#-+-#
#
# $SGE_ROOT/util/master_template
#
# DO NOT EDIT THIS FILE - this file is used as a template
# Don't change the markers #+-#+-#+-# and "#-#-#-#" , they will be removed
#
#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-
#
# SGE/SGEEE startup script
#
#___INFO__MARK_BEGIN__
##########################################################################
#
#  The Contents of this file are made available subject to the terms of
#  the Sun Industry Standards Source License Version 1.2
#
#  Sun Microsystems Inc., March, 2001
#
#
#  Sun Industry Standards Source License Version 1.2
#  =================================================
#  The contents of this file are subject to the Sun Industry Standards
#  Source License Version 1.2 (the "License"); You may not use this file
#  except in compliance with the License. You may obtain a copy of the
#  License at http://gridengine.sunsource.net/Gridengine_SISSL_license.html
#
#  Software provided under this License is provided on an "AS IS" basis,
#  WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING,
#  WITHOUT LIMITATION, WARRANTIES THAT THE SOFTWARE IS FREE OF DEFECTS,
#  MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE, OR NON-INFRINGING.
#  See the License for the specific provisions governing your rights and
#  obligations concerning the Software.
#
#  The Initial Developer of the Original Code is: Sun Microsystems, Inc.
#
#  Copyright: 2001 by Sun Microsystems, Inc.
#
#  All Rights Reserved.
#
##########################################################################
#___INFO__MARK_END__

#
# This script can be called with the following arguments:
#
#       start       start qmaster or shadowd 
#       stop        Terminates qmaster if we are on the master machine.
#       -qmaster    only starts qmaster 
#       -shadowd    start shadowd if found in the "shadow_masters" file
#       -migrate    shuts down qmaster if it is running
#                   on another host and start the daemons on this host
#
# If the file "primary_qmaster" in the $SGE_ROOT/$SGE_CELL/common
# exists and it contains the hostname of the current machine and qmaster
# is running on another host it will be shut down and started on this host
#
# Unix commands which may be used in this script:
#    cat cut tr ls grep awk sed basename
#
# This script requires the script $SGE_ROOT/util/arch
#

# Fixed command search path: only the system binary directories are searched
# for the Unix commands used below.
PATH=/bin:/usr/bin:/sbin:/usr/sbin

#---------------------------------------------------------------------------
# The following lines provide the necessary info for adding a startup script
# according to the Linux Standard Base Specification (LSB) which can
# be found at:
#
#    http://www.linuxfoundation.org/spec/booksets/LSB-Core-generic/LSB-Core-generic/initscrcomconv.html
#
### BEGIN INIT INFO
# Provides:       GENSGESVC
# Required-Start: $network $remote_fs
# Required-Stop:
# Default-Start:  3 5
# Default-Stop: 0 1 2 6
# Description:  start Grid Engine qmaster, schedd, shadowd
### END INIT INFO
#---------------------------------------------------------------------------

# Installation settings exported for the Grid Engine binaries started below.
# NOTE(review): the GEN* values look like placeholders that are substituted
# when this template is installed - confirm against the install procedure.
SGE_ROOT=GENROOT; export SGE_ROOT
SGE_CELL=GENCELL; export SGE_CELL
SGE_QMASTER_PORT=GENSGE_QMASTER_PORT; export SGE_QMASTER_PORT 
SGE_EXECD_PORT=GENSGE_EXECD_PORT; export SGE_EXECD_PORT

# Remove these variables from the environment
# (presumably settings of predecessor products - TODO confirm)
unset CODINE_ROOT GRD_ROOT COD_CELL GRD_CELL

# Architecture string as printed by the arch helper script
ARCH=`$SGE_ROOT/util/arch`

# library path setting required only for architectures where RUNPATH is not supported
case $ARCH in
#ENFORCE_SHLIBPATH#sol*|lx*)
#ENFORCE_SHLIBPATH#   ;;
*)
   # "arch -lib" prints the NAME of the shared library path variable;
   # the eval's below read and write the variable with that name.
   shlib_path_name=`$SGE_ROOT/util/arch -lib`
   old_value=`eval echo '$'$shlib_path_name`
   # Append $SGE_ROOT/lib/$ARCH to an existing value, otherwise set it
   if [ x$old_value = x ]; then
      eval $shlib_path_name=$SGE_ROOT/lib/$ARCH
   else
      eval $shlib_path_name=$old_value:$SGE_ROOT/lib/$ARCH
   fi
   export $shlib_path_name
   ;;
esac

#---------------------------------------------------------------------------
# Shutdown
#    Terminate the process whose pid is stored in a pid file.
#    $1 ... process name, handed to checkprog together with the pid
#    $2 ... pid file
#
#    As long as "checkprog <pid> <name>" succeeds a SIGTERM is sent every
#    2 seconds, at most 20 times. Returns 0 as soon as checkprog fails and
#    also when the pid file does not exist. After the last retry SIGKILL is
#    sent and the status of that kill is returned.
#    Uses the global $utilbin_dir.
#
Shutdown()
{
   name=$1
   pidfile=$2

   # No pid file - nothing to stop
   [ -f $pidfile ] || return 0

   pid=`cat $pidfile`
   maxretries=20
   i=0
   until [ $i -ge $maxretries ]; do
      # checkprog no longer confirms pid/name: we are done
      $utilbin_dir/checkprog $pid $name > /dev/null || return 0

      # Signal again on every round (the original author's note: qmaster
      # is killed repeatedly so that child processes get killed)
      kill $pid
      sleep 2
      i=`expr $i + 1`
   done

   # Last resort; the function returns the status of this kill
   kill -9 $pid
}


#---------------------------------------------------------------------------
# QmasterSpoolDir
#    Print the second field of the line(s) containing "qmaster_spool_dir"
#    in $SGE_ROOT/$SGE_CELL/common/bootstrap
#
QmasterSpoolDir()
{
   bootstrap_file=$SGE_ROOT/$SGE_CELL/common/bootstrap
   echo `grep qmaster_spool_dir $bootstrap_file | awk '{ print $2 }'`
}




#---------------------------------------------------------------------------
# CheckIfQmasterHost
#    $1 ... hostname to check
#    echo "true" if $1 equals the content of the file
#    $SGE_ROOT/$SGE_CELL/common/act_qmaster, otherwise echo "false"
#
CheckIfQmasterHost()
{
   host=$1
   act_master=`cat $SGE_ROOT/$SGE_CELL/common/act_qmaster`

   is_master=false
   if [ "$host" = "$act_master" ]; then
      is_master=true
   fi
   echo $is_master
}

#---------------------------------------------------------------------------
# CheckIfPrimaryQmasterHost
#    $1 ... hostname to check
#    echo "true" if the file $SGE_ROOT/$SGE_CELL/common/primary_qmaster
#    exists and its content equals $1, otherwise echo "false"
#
CheckIfPrimaryQmasterHost()
{
   host=$1
   fname=$SGE_ROOT/$SGE_CELL/common/primary_qmaster

   # Stays "false" when the file is missing or names a different host
   is_primary=false
   if [ -f $fname ] && [ "$host" = "`cat $fname`" ]; then
      is_primary=true
   fi
   echo $is_primary
}


#---------------------------------------------------------------------------
# CheckIfShadowMasterHost
#    $1 ... hostname to check
#    Sets the global variable shadow_host (nothing is echoed):
#       "true"   if $SGE_ROOT/$SGE_CELL/common/shadow_masters exists and
#                grep -i finds $1 in it
#       "false"  otherwise, including an empty $1
#    NOTE(review): $1 is used as a case-insensitive grep pattern, so a
#    hostname that is a substring of another entry matches as well.
#
CheckIfShadowMasterHost()
{
   host=$1
   fname=$SGE_ROOT/$SGE_CELL/common/shadow_masters

   shadow_host="false"

   # An empty hostname must never match; unquoted it would even make grep
   # take the file name as the pattern and read from stdin.
   if [ -n "$host" ] && [ -f "$fname" ]; then
      # stdout first, then stderr: both go to /dev/null
      if grep -i "$host" "$fname" > /dev/null 2>&1; then
         shadow_host="true"
      fi
   fi
}

#---------------------------------------------------------------------------
# GetPathToBinaries
#    echo the name of the bin_dir on this system
#
#    The value of "binary_path" is read from the bootstrap file. If qstat
#    is found directly in that directory it is echoed as is; if qstat is
#    found in the sub directory named like the output of util/arch, that
#    sub directory is echoed.
#    echo "none" if the bootstrap file is missing or holds no binary_path
#    value.
#    NOTE(review): when a binary_path is configured but qstat is found in
#    neither location, the configured value is echoed unverified (behavior
#    kept as it was).
GetPathToBinaries()
{
   cfgname=$SGE_ROOT/$SGE_CELL/common/bootstrap

   base=none

   if [ -f "$cfgname" ]; then
      base=`grep binary_path "$cfgname" | awk '{ print $2 }'`
      if [ -z "$base" ]; then
         # No value configured: report "none" so that callers comparing
         # against "none" notice the failure instead of getting ""
         base=none
      elif [ -f "$base/qstat" ]; then
         :
      elif [ -f "$SGE_ROOT/util/arch" ]; then
         arch=`"$SGE_ROOT/util/arch"`
         if [ -f "$base/$arch/qstat" ]; then
            base=$base/$arch
         fi
      fi
   fi

   echo "$base"
}


#---------------------------------------------------------------------------
# GetAdminUser
#    echo the name of the admin user on this system
#
#    The name is the value of "admin_user" in the bootstrap file.
#    echo "root" if the bootstrap file is missing, if it holds no
#    admin_user value or if the value is "none" (in any letter case)
GetAdminUser()
{
   cfgname=$SGE_ROOT/$SGE_CELL/common/bootstrap
   user=none

   if [ -f "$cfgname" ]; then
      user=`grep admin_user "$cfgname" | awk '{ print $2 }'`
   fi

   # Compare case-insensitively. An empty lookup result is handled like
   # "none"; unquoted it used to break the test and echo an empty name.
   lc_user=`echo "$user" | tr "[A-Z]" "[a-z]"`
   case "$lc_user" in
   ""|none)
      user=root
      ;;
   esac
   echo "$user"
}

#---------------------------------------------------------------------------
# GetPathToUtilbin
#    echo the path to the binaries in utilbin, i.e.
#    $SGE_ROOT/utilbin/<output of util/arch>
#    The path is accepted if the "gethostname" binary exists in it
#    echo "none" if util/arch or the gethostname binary is missing
#
GetPathToUtilbin()
{
   base=none
   arch_script=$SGE_ROOT/util/arch

   if [ -f $arch_script ]; then
      arch=`$arch_script`
      candidate=$SGE_ROOT/utilbin/$arch
      # Only a directory that contains gethostname counts
      if [ -f $candidate/gethostname ]; then
         base=$candidate
      fi
   fi

   echo $base
}

#---------------------------------------------------------------------------
# CheckRunningQmaster
#    Poll with qping until the qmaster named in the act_qmaster file
#    answers. Up to 30 attempts are made with a 2 second pause in between;
#    the act_qmaster file is read again before every new attempt.
#    If no attempt succeeds a hint to check the messages file is printed.
#    Uses the globals $bin_dir and $utilbin_dir; an empty SGE_QMASTER_PORT
#    is filled in from getservbyname.
#
CheckRunningQmaster()
{
   masterhost=`cat $SGE_ROOT/$SGE_CELL/common/act_qmaster`
   running=false
   loop=0

   # Port not configured: look up the number of the sge_qmaster service
   if [ -z "$SGE_QMASTER_PORT" ]; then
      SGE_QMASTER_PORT=`$utilbin_dir/getservbyname -number sge_qmaster`
   fi

   while [ $loop -ne 30 ]; do
      if $bin_dir/qping -info $masterhost $SGE_QMASTER_PORT qmaster 1 > /dev/null 2>&1; then
         running=true
         break
      fi

      sleep 2
      masterhost=`cat $SGE_ROOT/$SGE_CELL/common/act_qmaster`
      loop=`expr $loop + 1`
   done

   if [ $running = "false" ]; then
      echo
      echo "sge_qmaster didn't start!"
      echo "Please check the messages file"
      echo
   fi
}

#---------------------------------------------------------------------------
# DetectSMFService - sets the variable "service" to the SMF service name
#                    matching the given component
# $1 ... component name (the callers in this script pass "qmaster" and
#        "shadowd")
#
# Result in "service":
#    ""   if noSMF is "true", if /lib/svc/share/smf_include.sh is missing,
#         if smf_present fails or if svcs does not know the service
#    svc:/application/sge/<name>:<content of the cluster_name file>
#         otherwise
# The script is terminated with exit code $SMF_EXIT_ERR_CONFIG if the
# cluster_name file is not readable or if $SMF_FMRI is set and differs
# from the computed service name.
#
DetectSMFService()
{
   name=$1
   service=""

   # SMF usage has been switched off (main sets noSMF for "-nosmf")
   if [ "$noSMF" = true ]; then
      return
   fi
   #Otherwise check whether SMF is available on this system
   if [ -f /lib/svc/share/smf_include.sh ]; then
      # smf_present and SMF_EXIT_ERR_CONFIG are not defined in this script;
      # presumably they are provided by this include - TODO confirm
      . /lib/svc/share/smf_include.sh
      smf_present
      if [ $? -ne 0 ]; then
         return
      fi
   else
      return
   fi

   #Check we have cluster_name file
   if [ ! -r "$SGE_ROOT/$SGE_CELL/common/cluster_name" ]; then
      echo "Error: could not find $SGE_ROOT/$SGE_CELL/common/cluster_name!"
      exit $SMF_EXIT_ERR_CONFIG
   fi
   #Cluster name must be unique
   SGE_CLUSTER_NAME=`cat $SGE_ROOT/$SGE_CELL/common/cluster_name 2>/dev/null`
   
   # The cluster name is used as the instance part of the service name
   service="svc:/application/sge/$name:$SGE_CLUSTER_NAME"

   # SMF_FMRI is not set in this script; if the environment provides it,
   # it has to name exactly the service computed above
   if [ -n "$SMF_FMRI" -a "$service" != "$SMF_FMRI" ]; then
      #Someone is trying to fool us
      echo "Error: $service does not match FMRI $SMF_FMRI provided by SMF!"
      echo "Check that service instance $SGE_CLUSTER_NAME exist on your system"
      exit $SMF_EXIT_ERR_CONFIG
   fi
   #Check if service exists
   /usr/bin/svcs $service > /dev/null 2>&1
   if [ $? -ne 0 ]; then
      #No such service found in the system
      service=""
   fi
}


#---------------------------------------------------------------------------
# usage
#    Print the help text to stdout and terminate the script with exit
#    code 1
#
usage()
{
   # One argument per output line; "" produces an empty line
   printf '%s\n' \
      "Grid Engine start/stop script. Valid parameters are:" \
      "" \
      "   (no parameters): start qmaster and shadow daemon if applicable" \
      "   \"start\"        dto." \
      "   \"stop\"         shutdown local Grid Engine processes and jobs" \
      "   \"-qmaster\"     only start/stop qmaster (if applicable)" \
      "   \"-shadowd\"     only start/stop shadowd (if applicable)" \
      "   \"-migrate\"     shutdown qmaster if it's running on another" \
      "                    host and restart it on this host" \
      "                    Migration only works if this host is an admin host" \
      "   \"-nosmf\"       force no SMF" \
      "" \
      "Only one of the parameters \"start\", \"stop\" or \"softstop\" is allowed." \
      "Only one of the parameters beginning  with \"-\" is allowed. Does not " \
      "apply to -nosmf." \
      "" \
      "Default argument is \"start\" for all components." \
      "Default for \"stop\" is shutting down all components." \
      ""
   exit 1
}


#---------------------------------------------------------------------------
# MAIN Procedure
#
# Command line evaluation: more than three arguments, "-h" or "help" as
# first argument and any unknown argument lead to the usage text.
#

if [ "$#" -gt 3 ] || [ "$1" = "-h" ] || [ "$1" = "help" ]; then
   usage
fi

# Defaults: start both qmaster and shadowd
startup=true
qmaster=true
shadowd=true
qstd=false
migrate_qmaster=false
softstop=false
noSMF=false

for i in $*; do
   case "$i" in
   start)
      startup=true
      ;;
   stop)
      startup=false
      ;;
   softstop)
      startup=false
      softstop=true
      ;;
   -qmaster)
      qmaster=true
      shadowd=false
      ;;
   -shadowd)
      qmaster=false
      shadowd=true
      ;;
   -migrate)
      # migration concerns the qmaster only
      migrate_qmaster=true
      qmaster=true
      shadowd=false
      ;;
   -nosmf)
      noSMF=true
      ;;
   *)
      usage
      ;;
   esac
done

# Locate the Grid Engine binaries; without them nothing can be started
# or stopped
bin_dir=`GetPathToBinaries`
case "$bin_dir" in
none)
   echo "can't determine path to Grid Engine binaries"
   exit 1
   ;;
esac

utilbin_dir=`GetPathToUtilbin`
case "$utilbin_dir" in
none)
   echo "can't determine path to Grid Engine utility binaries"
   exit 1
   ;;
esac

# HOST is the name printed by "gethostname -aname", UQHOST the part of it
# in front of the first dot
HOST=`$utilbin_dir/gethostname -aname`
UQHOST=`$utilbin_dir/gethostname -aname | cut -f1 -d.`
qmaster_spool_dir=`QmasterSpoolDir`

# Sets shadow_host to true/false
CheckIfShadowMasterHost $HOST

# Start branch ($startup = true): optionally take over the qmaster from
# another host, then start qmaster and/or shadowd.
# Stop branch: shut down shadowd and/or qmaster on this host.
# Both branches go through SMF (svcadm) when DetectSMFService reports a
# service and the script was not started with SMF_FMRI set.
if [ "$startup" = true ]; then

   # if service tags are enabled
   st_enabled=`sh $SGE_ROOT/util/sgeST/sge_st "enabled"`
   if [ "$st_enabled" = "true" -a "$qmaster" = "true" ]; then
      sh $SGE_ROOT/util/sgeST/sge_st "register"
   fi

   # qmaster_host=true          if this host is the one named in the
   #                            act_qmaster file
   # primary_qmaster_host=true  if this host is the one named in the
   #                            primary_qmaster file

   qmaster_host=`CheckIfQmasterHost $HOST`
   primary_qmaster_host=`CheckIfPrimaryQmasterHost $HOST`

   if [ $qmaster = true -a $qmaster_host = true -a $migrate_qmaster = true ]; then
      echo "   qmaster running on this host. Will not migrate qmaster."
      exit 1
   fi

   # Take over the qmaster from another host: either requested with
   # -migrate or because this host is the primary qmaster host
   if [ $qmaster = true -a $qmaster_host = false -a  \
        \( $primary_qmaster_host = true -o $migrate_qmaster = true \) ]; then
       actual_qmaster_host=`cat $SGE_ROOT/$SGE_CELL/common/act_qmaster`
       echo "   shutting down qmaster on host \"$actual_qmaster_host\" ..."
       # A "denied" in the qconf output is reported as "no admin host"
       qconf_output=`$bin_dir/qconf -ks 2>&1 | grep "denied"`
       if [ "$qconf_output" != "" ]; then
          echo "   denied: host \"$HOST\" is no admin host."
          exit 1
       fi
       $bin_dir/qconf -km > /dev/null 2>&1

       # Wait (10 tries, 3 seconds apart) until qping to the old qmaster
       # fails
       qping_count=0
       qping_retries=10
       qping_exit_state=0
       while [ $qping_count -lt $qping_retries ]; do
          $bin_dir/qping -info $actual_qmaster_host $SGE_QMASTER_PORT qmaster 1  > /dev/null 2>&1
          qping_exit_state=$?
          if [ $qping_exit_state -ne 0 ]; then
             break
          fi
          sleep 3
          qping_count=`expr $qping_count + 1`
       done

       if [ $qping_exit_state -eq 0 ]; then
       #  qmaster is still running
          echo "   qmaster on host $actual_qmaster_host still alive. Cannot migrate qmaster."
          exit 1
       fi

       # Wait (10 tries, 3 seconds apart) for the lock file in the
       # qmaster spool directory
       lock_file_read_retries=10
       lock_file_read_count=0
       lock_file_found=0
       while [ $lock_file_read_count -lt $lock_file_read_retries ]; do
          if [ -f $qmaster_spool_dir/lock ]; then
             lock_file_found=1
             break
          fi
          sleep 3
          lock_file_read_count=`expr $lock_file_read_count + 1`
       done

       if [ $lock_file_found -eq 0 ]; then
       #  old qmaster did not write lock file
          echo "   old qmaster did not write lock file. Cannot migrate qmaster."
          echo "   Please verify that qmaster on host $actual_qmaster_host is down"
          echo "   and make sure that the lock file in qmaster spool directory is"
          echo "   read-able."
          exit 1
       fi

       # From here on this host is treated as the qmaster host
       qmaster_host=true
   fi

   if [ $qmaster = true ]; then
      DetectSMFService qmaster
      #We want to use smf
      if [ -z "$SMF_FMRI" -a -n "$service"  ]; then
         echo "   starting sge_qmaster"
         svcadm enable -st $service
      #For -migrate with SMF qmaster_host is not yet set for SMF start (2nd)
      elif [ $qmaster_host = true -o \( -n "$SMF_FMRI" -a "$SMF_FMRI" = "$service" \) ]; then
         echo "   starting sge_qmaster"
         $bin_dir/sge_qmaster
         [ $? -eq 0 -a -d /var/lock/subsys ] && touch /var/lock/subsys/sgemaster >/dev/null 2>&1
         CheckRunningQmaster
      else
         # Neither SMF nor the act_qmaster file allow a start on this host.
         # This diagnostic used to sit in an "elif" of the outer
         # "$qmaster = true" test where it could never be reached.
         echo
         echo "sge_qmaster didn't start!"
         echo "This is not a qmaster host!"
         echo "Please, check your act_qmaster file!"
         echo
      fi
   fi

   if [ $shadowd = true -a $shadow_host = true ]; then
      start_shadowd=true
      UQpidfile=$qmaster_spool_dir/shadowd_$UQHOST.pid
      pidfile=$qmaster_spool_dir/shadowd_$HOST.pid

      # A shadowd found alive through either pid file (full or short
      # hostname) prevents a second start
      if [ -f $pidfile ]; then
         pid=`cat $pidfile`
         $utilbin_dir/checkprog $pid sge_shadowd > /dev/null
         if [ "$?" = 0 ]; then
            start_shadowd=false
         fi
      fi

      if [ -f $UQpidfile ]; then
         pid=`cat $UQpidfile`
         $utilbin_dir/checkprog $pid sge_shadowd > /dev/null
         if [ "$?" = 0 ]; then
            start_shadowd=false
         fi
      fi

      if [ $start_shadowd = true ]; then
         DetectSMFService shadowd
         echo "   starting sge_shadowd"
         #We want to use smf
         if [ -z "$SMF_FMRI" -a -n "$service"  ]; then
            svcadm enable -st $service
         else
            $bin_dir/sge_shadowd
         fi
      else
         echo "   found running sge_shadowd - not starting"
         exit 1
      fi
   fi
else
   if [ $shadowd = true -a $shadow_host = true ]; then
      echo "   Shutting down Grid Engine shadowd"
      DetectSMFService shadowd
      if [ -z "$SMF_FMRI" -a -n "$service"  ]; then
         svcadm disable -st $service
      else
         # Send SIGTERM to shadowd
         if [ -f $qmaster_spool_dir/shadowd_$UQHOST.pid ]; then
            Shutdown sge_shadowd $qmaster_spool_dir/shadowd_$UQHOST.pid
         elif [ -f $qmaster_spool_dir/shadowd_$HOST.pid ]; then
            Shutdown sge_shadowd $qmaster_spool_dir/shadowd_$HOST.pid
         fi
      fi
   fi

   if [ $qmaster = true ]; then
      if [ `CheckIfQmasterHost $HOST` = true ]; then
         echo "   Shutting down Grid Engine qmaster"
         DetectSMFService qmaster
         if [ -z "$SMF_FMRI" -a -n "$service"  ]; then
            svcadm disable -st $service
            exit $?
         else
            # Send SIGTERM to qmaster
            Shutdown sge_qmaster $qmaster_spool_dir/qmaster.pid
            ret=$?
            # The lock file is removed only when running as uid 0 and
            # after a successful Shutdown
            if [ -f /var/lock/subsys/sgemaster ]; then
               uid=`$utilbin_dir/uidgid -uid`
               if [ "$uid" = "0" -a "$ret" = "0" ]; then
                  rm -f /var/lock/subsys/sgemaster >/dev/null 2>&1
               else
                  echo "Can't shut down qmaster!"
                  exit 1
               fi
            fi
         fi
      fi
   fi

fi
