#!/bin/sh
##**************************************************************
##
## Copyright (C) 1990-2007, Condor Team, Computer Sciences Department,
## University of Wisconsin-Madison, WI.
## 
## Licensed under the Apache License, Version 2.0 (the "License"); you
## may not use this file except in compliance with the License.  You may
## obtain a copy of the License at
## 
##    http://www.apache.org/licenses/LICENSE-2.0
## 
## Unless required by applicable law or agreed to in writing, software
## distributed under the License is distributed on an "AS IS" BASIS,
## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
## See the License for the specific language governing permissions and
## limitations under the License.
##
##**************************************************************


# Self-assignments: the values are whatever the caller's environment
# supplied; an unset variable simply becomes set-but-empty.
_CONDOR_PROCNO=$_CONDOR_PROCNO
_CONDOR_NPROCS=$_CONDOR_NPROCS
_CONDOR_REMOTE_SPOOL_DIR=$_CONDOR_REMOTE_SPOOL_DIR

# Both helper scripts are looked up in Condor's libexec directory, so
# ask condor_config_val for it once and build both paths from the answer.
LIBEXEC_DIR=$(condor_config_val libexec)
SSHD_SH=$LIBEXEC_DIR/sshd.sh
CONDOR_SSH=$LIBEXEC_DIR/condor_ssh

# Source sshd.sh into this shell, handing it this node's number and the
# node count.  sshd_cleanup, used further down, is not defined in this
# file -- presumably sshd.sh provides it (confirm there).
# Everything is quoted so a libexec path containing whitespace still works.
. "$SSHD_SH" "$_CONDOR_PROCNO" "$_CONDOR_NPROCS"

# Every node other than the head node (node number 0) stops here: it
# blocks in `wait` until this shell's background jobs have finished
# (presumably the sshd started by sshd.sh -- confirm there), then runs
# the sshd cleanup and exits successfully.
# The operand is quoted so an empty value cannot produce a malformed test.
if [ "$_CONDOR_PROCNO" -ne 0 ]
then
	wait
	sshd_cleanup
	exit 0
fi

# The first argument is the MPI program to run; the arguments left after
# the shift are passed on to it when it is started.
EXECUTABLE=$1
shift

# The binary is copied with its executable flag cleared, so this script
# has to set it again.  Quoted so a path containing whitespace or glob
# characters is passed to chmod as one argument.
chmod +x "$EXECUTABLE"

# Set this to the bin directory of your LAM installation.
# It must also be in your .cshrc file, so that the remote side
# can find it!
LAMDIR=/u/g/t/gthain/lam-7.0.6/bin
PATH=$LAMDIR:$PATH
export PATH

# To allow multiple LAM jobs to run on a single machine, each one needs
# a reasonably unique session suffix; this shell's PID is used.
LAM_MPI_SESSION_SUFFIX=$$
export LAM_MPI_SESSION_SUFFIX

# This is the way to accomplish the same thing when running
# LAM < 7.0.2.
LAM_MPI_SOCKET_SUFFIX=$$
export LAM_MPI_SOCKET_SUFFIX

# Point LAM's remote-shell setting at condor_ssh.
LAMRSH=$CONDOR_SSH
export LAMRSH

# When the user kills the job, this script receives SIGTERM.  The signal
# is trapped so that the sshd cleanup and the LAM shutdown still run
# before the script exits.
finalize() {
	# Same order as the normal exit path: sshd cleanup first, then LAM.
	sshd_cleanup
	lamhalt

	# No explicit status is passed to exit.
	exit
}
trap finalize TERM

# Location of the contact file inside the job's scratch directory,
# exported so that child processes can see it too.
CONDOR_CONTACT_FILE=$_CONDOR_SCRATCH_DIR/contact
export CONDOR_CONTACT_FILE

# Sort the contact file numerically on its first field and keep only the
# second field, which is the machine name that condor_ssh knows how to
# use.  The result is written to "machines" in the current directory.
# "-k 1" replaces the obsolete "+0" key syntax, which current sort
# implementations interpret as a file name and fail on.
sort -n -k 1 < "$CONDOR_CONTACT_FILE" | awk '{print $2}' > machines


# Start the LAM environment on the hosts listed in "machines".
# For older versions of LAM you may need to remove the "-ssi boot rsh"
# options.
if ! lamboot -ssi boot rsh machines
then
	echo "lamscript error booting lam"
	# Do the same file cleanup as the normal exit path before giving
	# up.  lamhalt is not called because LAM never came up.
	sshd_cleanup
	/bin/rm -f machines
	exit 1
fi

## Run the actual MPI job.
# It is started in the background and waited for below (presumably so
# that the TERM trap can run while the job is in progress).
# "$EXECUTABLE" and "$@" are quoted so the program path and every
# program argument reach mpirun exactly as given, without being re-split
# on whitespace or glob-expanded.
mpirun C "$EXECUTABLE" "$@" &

CHILD=$!

# wait reports a status above 128 when it is interrupted by a trapped
# signal instead of by the child finishing, so keep waiting until a
# status of 128 or less comes back.  130 is only an initial value that
# gets the loop started.
# NOTE(review): a child that is itself killed by a signal also reports a
# status above 128, which makes the loop wait one more time.
TMP=130
while [ "$TMP" -gt 128 ] ; do
	wait "$CHILD"
	TMP=$?
done

# Clean up files.
sshd_cleanup
/bin/rm -f machines

# Shut down LAM.
lamhalt

# Exit with the status collected by the wait loop.
exit "$TMP"
