#!/usr/bin/env perl

##**************************************************************
##
## Copyright (C) 1990-2007, Condor Team, Computer Sciences Department,
## University of Wisconsin-Madison, WI.
## 
## Licensed under the Apache License, Version 2.0 (the "License"); you
## may not use this file except in compliance with the License.  You may
## obtain a copy of the License at
## 
##    http://www.apache.org/licenses/LICENSE-2.0
## 
## Unless required by applicable law or agreed to in writing, software
## distributed under the License is distributed on an "AS IS" BASIS,
## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
## See the License for the specific language governing permissions and
## limitations under the License.
##
##**************************************************************



# User modifiable settings:

	# Who gets mail about problems on remote machines ("Machine B").  If
	# commented out, defaults to the same user who gets mail from problems on
	# the local Condor installations (CONDOR_ADMIN)
my $config_condor_admin;
#$config_condor_admin = 'prelz@mi.infn.it';
#$config_condor_admin = 'adesmet@cs.wisc.edu';
chomp($config_condor_admin = `condor_config_val CONDOR_ADMIN`);

	# Where is your condor_collector running?  If commented out, defaults
	# to CONDOR_HOST as set in the local condor_config.
	# The fully-qualified name is preferred; the plain hostname is only a
	# fallback for when `hostname -f` produced no output.
my $config_condor_host;
chomp( $config_condor_host = `hostname -f` );
if($config_condor_host eq '') {
	chomp( $config_condor_host = `hostname` );
}


	# If 1, then schedds have (effectively) random names.  This is
	# useful for doing dynamic matchmaking against schedds, but doesn't
	# work if you're specifying the remote schedd in your submit file.
my $DYNAMIC_SCHEDD_NAMES = 1;

# End user modifiable settings.




# Here is the layout of the main data structure:
# %job_groups{'sitename#uniquename'} =
#    { uniquename       => "..."
#      proxyfile        => "..."
#      sitename         => "..."
#      sitegatekeeper   => "..."
#      scheddjobid      => "..." #undefined until schedd job submitted
#      scheddstatus     => ...
#      numjobs          => ...
#    }


# Debug level. 0 is basically silent.
my $DEBUG_LEVEL;
my $LOG_DIR;

# Verbosity comes from the Condor configuration.  If condor_config_val
# fails, fall back to a chatty level of 10.
$DEBUG_LEVEL = int ( `condor_config_val GLITE_CONDORC_DEBUG_LEVEL` );
chomp( $DEBUG_LEVEL );
$DEBUG_LEVEL = 10 if $? != 0;

# Directory for our scratch and log files.  Use /tmp unless the lookup
# succeeded and names an existing directory.
$LOG_DIR = `condor_config_val GLITE_CONDORC_LOG_DIR`;
chomp( $LOG_DIR );
unless ( $? == 0 && -d $LOG_DIR ) {
	$LOG_DIR = "/tmp";
}

# Set a longer timeout for our condor commands
$ENV{_condor_TOOL_TIMEOUT_MULTIPLIER} = "4";

# initialize variables that will be used throughout our run
unless ( defined $config_condor_host ) {
	$config_condor_host = `condor_config_val CONDOR_HOST`;
	chomp( $config_condor_host );
}
# The "$(HOME)" below is kept literal here, not expanded by Perl.
$base_dir = "\$(HOME)/Condor_glidein";
#$config_local_dir = "\$(HOME)/Condor_glidein/local";
#$config_sbin = "\$(HOME)/Condor_glidein/sbin";
unless ( defined $config_condor_admin ) {
	$config_condor_admin = `condor_config_val CONDOR_ADMIN`;
	chomp( $config_condor_admin );
}
$config_mail = "/usr/bin/mail";

# Startup: rebuild %job_groups from schedd jobs that are already in the
# queue (for instance, left over from a previous run of this script).
#
# The open() below spawns a shell, so it normally can't fail; real
# failures only show up when we close() the pipe.
debug_print(4, "Startup: Scanning queue for existing schedds\n");
# Each output line has six tab-separated fields:
# <Cluster>.<Proc> <x509proxy> <uniquename> <sitename> <sitegatekeeper> <job status>
open( CQ, 'condor_q -const "DynamicScheddJob=?=True" -format "%d." ClusterId -format "%d\t" ProcId -format "%s\t" X509UserProxy -format "%s\t" daemon_unique_name -format "%s\t" SiteName -format "%s\t" GridResource -format "%d" JobStatus -format "\n" ClusterId |' ) || die( "Can't run condor_q" );

while ( my $row = <CQ> ) {

	chomp $row;

	my @field = split( /\t/, $row );

	# Ignore anything that doesn't have exactly the six expected fields.
	next unless @field == 6;

	my( $jobid, $proxy, $unique, $site, $gatekeeper, $status ) = @field;
	my $key = "$site#$unique";

	# Only one schedd job per group is kept: if we already recorded one,
	# remove that earlier job and let this one replace it.
	if ( defined $job_groups{$key} ) {
		my $old_jobid = $job_groups{$key}->{'scheddjobid'};
		debug_print(5, "Two schedd jobs for same site,subject! " .
				"Removing old job $old_jobid\n");
		system( "condor_rm $old_jobid" );
	}

	# Remove 'gt2 ' from the beginning of GridResource
	$gatekeeper =~ s/gt2 //;

	$job_groups{$key} = {
		'uniquename'      => $unique,
		'proxyfile'       => $proxy,
		'sitename'        => $site,
		'sitegatekeeper'  => $gatekeeper,
		'scheddjobid'     => $jobid,
		'scheddstatus'    => $status,
		'numjobs'         => 0
		};
}

# If the exit status of condor_q is 32512 (exited with status 127), the
# probable cause is that the shell couldn't find condor_q in the path.
# We want to die in that case, but not if condor_q itself returns non-zero.
if ( !close( CQ ) && $? == 32512 ) {
	die( "Can't run condor_q" );
}

while ( 1 ) { # Loops forever; the continue block exits once our parent is gone.
	debug_print(3, "###########################\n");

	# check queue for user jobs
	##########################################
	# Job counts are recomputed from scratch on every pass.
	foreach my $group ( values %job_groups ) {
		$group->{'numjobs'} = 0;
	}

	debug_print(4, "Scanning for grid jobs\n");
	# Need to catch SIGCHLD to know if command failed
	# format: <uniquename> <x509proxy> <sitename> <sitegatekeeper>
	open( CQ, q[condor_q -const 'JobUniverse==9&&JobGridType=?="condor"' -format "%s\t" daemon_unique_name -format "%s\t" x509userproxy -format "%s\t" SiteName -format "%s" SiteGatekeeper -format "\n" ClusterId |] ) || die( "Can't run condor_q" );

	while ( <CQ> ) {

		chomp;
		next if /^$/;
		debug_print(4, "    Found one ($_)\n");

		my @line = split( /\t/ );

		# Only lines with exactly four fields are usable.
		if ( $#line == 3 ) {

			my $key = "${line[2]}#${line[0]}";

			if ( defined $job_groups{$key} ) {
				$job_groups{$key}->{'numjobs'}++;
				# Set the proxy file here because we may not have a valid
				# one yet: if hash for this group was created from schedd
				# job info (first scan in startup), proxyfile is one
				# specific to that schedd job that can disappear with it
				$job_groups{$key}->{'proxyfile'} = "${line[1]}";
			} else {
				$job_groups{$key} = {
					'uniquename'      => "${line[0]}",
					'proxyfile'       => "${line[1]}",
					'sitename'        => "${line[2]}",
					'sitegatekeeper'  => "${line[3]}",
					'numjobs'         => 1
					};
			}
		}

	}

	# If we can't get the list of jobs, then we have nothing to act on.
	# Try again later.
	if ( !close( CQ ) ) {
		debug_print( 1, "condor_q of user jobs timed out\n" );
		next;
	}

	# check queue for schedd jobs (see if any have disappeared)
	##########################################
	# Forget what we knew; groups whose schedd job is no longer in the
	# queue are left with an undefined job id and status.
	foreach my $group ( values %job_groups ) {
		$group->{'scheddstatus'} = undef();
		$group->{'scheddjobid'} = undef();
	}

	# Should we exclude schedd jobs in X state?
	# Need to catch SIGCHLD to know if command failed
	# format: <Cluster>.<Proc> <uniquename> <sitename> <job status>
	open( CQ, 'condor_q -const "DynamicScheddJob=?=True" -format "%d." ClusterId -format "%d\t" ProcId -format "%s\t" daemon_unique_name -format "%s\t" SiteName -format "%d" JobStatus -format "\n" ClusterId |' ) || die( "Can't run condor_q" );

	while ( <CQ> ) {

		chomp;

		my @line = split( /\t/ );

		if ( $#line == 3 ) {

			my $key = "${line[2]}#${line[1]}";

			if ( defined $job_groups{$key} ) {
				$job_groups{$key}->{'scheddstatus'} = $line[3];
				$job_groups{$key}->{'scheddjobid'} = $line[0];
			} else {
				debug_print( 1, "Unknown schedd job for group $key\n" );
			}
		}

	}

	# If we can't get the list of jobs, then we have nothing to act on.
	# Try again later.
	if ( !close( CQ ) ) {
		debug_print( 1, "condor_q of launcher jobs timed out\n" );
		next;
	}

	# submit schedds as needed
	##########################################
	debug_print(3, "Submit schedds as needed\n");
	foreach my $group ( values %job_groups ) {

		my $key = "$group->{sitename}#$group->{uniquename}";

		if ( $group->{'numjobs'} > 0 ) {
			if ( !defined( $group->{'scheddjobid'} ) ) {
				debug_print(1, "    group $key has no schedd, starting one\n");
				# submit_schedd_job() returns undef on failure, in which
				# case we simply try again on the next pass.
				$group->{'scheddjobid'} = submit_schedd_job( $group );
			} else {
				debug_print(4, "    Group $key already has a schedd\n");
			}
			if ( defined( $group->{'scheddjobid'} ) ) {
				$group->{'scheddsubmitted'} = 1;
			}
		}

	}

	# do upkeep on all our schedd jobs
	##########################################
	debug_print(5, "Upkeep\n");
	foreach my $group ( values %job_groups ) {

		my $key = "$group->{sitename}#$group->{uniquename}";
		my $line = join( ",", values( %{$group} ) );
		debug_print(5, "    $line\n");

		if ( $group->{'numjobs'} == 0 ) {
			debug_print(5, "        Appears healthy but unused\n");
			# condor_rm?
		} elsif ( $group->{'scheddstatus'} == 5 ) {
			debug_print(5, "        Is held.  Removing.\n");
			system( "condor_rm $group->{'scheddjobid'}" );
			my $reason;
			chomp( $reason = `condor_q $group->{'scheddjobid'} -format "%d " HoldReasonCode -format "%d\n" HoldReasonSubCode` );
			# $reason is the text "<code> <subcode>", so it must be compared
			# as a string.  A numeric comparison would only look at the
			# leading number and match every hold with reason code 2.
			if ( $reason eq "2 131" ) {
				debug_print(5,"        Proxy expiring, force removing.\n");
				system( "condor_rm -f $group->{'scheddjobid'}" );
			}
		} elsif ( $group->{'scheddstatus'} == 3 ) {
			debug_print(5, "        Is removed. Ignoring.\n");
		} else {
			debug_print(5, "        Appears healthy and used\n");
#			advertise_schedd( $group );
			# check for held
			# update proxy?
		}
	}
} continue {
	# This block also runs after the "next" statements above, so every
	# pass (successful or not) ends with the parent check and the sleep.
	if(getppid() == 1) {
		debug_print(9, "My parent disappeared.  Assuming he exitted.  Quitting\n");
		exit(1);
	}

	sleep( 30 );
}

# Submit a grid-universe job that starts a condor_master (and thereby a
# schedd) on the remote site described by $group.
#
# $group is a reference to one entry of %job_groups; the keys read here
# are proxyfile, sitegatekeeper, sitename and uniquename.
#
# Returns the job id ("<cluster>.0") on success, or undef if the proxy is
# invalid, a scratch file could not be written completely, condor_submit
# failed, or its output could not be parsed.  Dies if a scratch file
# cannot be opened for writing.
#
# On success the scratch files are removed.  On failure they are left in
# $LOG_DIR; their names depend only on our pid, so the next attempt
# overwrites them.
sub submit_schedd_job
{
	my( $group ) = @_;
	my $cmd_file = "/$LOG_DIR/condorc-launcher-submit.$$";
	my $submit_out = "/$LOG_DIR/condorc-launcher-submit-out.$$";
	my $cluster;
	my $env = '';
	my $setup_script = "/$LOG_DIR/condorc-launcher-starter.$$";
	# No directory on purpose: the generated script prefixes `pwd`.
	my $mapfile = "condorc-launcher-mapfile.$$";
	my $schedd_name;
	my $exec;
	my $args;
	my $job_x509_subject;
	my $my_x509_cert;
	my $my_x509_subject;

	# The proxy path comes from a job ad, so quote it before handing it
	# to the shell.
	my $quoted_proxy = _condorc_shell_quote( $group->{proxyfile} );

	debug_print(2, "Submitting schedd to $group->{sitegatekeeper} with $group->{proxyfile}\n");

	# Validate the proxy before doing anything else with it.
	if ( system(  "grid-proxy-info -exists -file $quoted_proxy >/dev/null 2>&1" ) != 0 ) {
		debug_print(2, "Proxy $group->{proxyfile} is invalid, not submitting schedd job\n");
		return undef;
	}

	# Subject of the job's proxy, with the "/CN=proxy" components removed.
	chomp( $job_x509_subject = `openssl x509 -in $quoted_proxy -subject -noout` );
	$job_x509_subject =~ s/^subject *= *//;
	$job_x509_subject =~ s|/CN=proxy||g;

	# Subject of our own daemon certificate.
	chomp( $my_x509_cert = `condor_config_val GSI_DAEMON_CERT` );
	my $quoted_cert = _condorc_shell_quote( $my_x509_cert );
	chomp( $my_x509_subject = `openssl x509 -in $quoted_cert -subject -noout` );
	$my_x509_subject =~ s/^subject *= *//;

	# Use the daemon_unique_name we found in the job ad to use as a
	# schedd name. All jobs with the same uniquename go to the same
	# dynamically-started schedd.
	$schedd_name = $group->{'uniquename'};

	# Write the setup script that will run on the remote machine
	open( my $script, '>', $setup_script ) || die "error opening $setup_script";

	# NOTE: the script runs under /bin/sh, so string comparisons inside
	# [ ] must use "=" ("==" is a bash extension).
	print {$script} <<EOF;
#! /bin/sh

#setup script
# usage: setup <args to condor_master>

# Stupid globus bug 1486 means we can't use variable substitution in the
# job's arguments (if there's more than one argument). So we can't pass
# the path to the condor_master in the arguments. Instead, we'll just
# hard-code it in this script for now.
#master=\$1
#shift

BASEDIR=\$1
shift

for newdir in \$BASEDIR \$_condor_LOCAL_DIR \$_condor_LOCAL_DIR/log \$_condor_LOCAL_DIR/spool; do
   if [ ! -d \$newdir ]; then
        /bin/mkdir -p \$newdir;
    fi
    if [ ! -d \$newdir ]; then
        echo "ERROR mkdir -p \$newdir"
        exit 1;
    fi
done


my_condor_master=\$BASEDIR/sbin/condor_master
if [ ! -x \$my_condor_master ]; then
	my_condor_master=/opt/condor-c/sbin/condor_master
	if [ "x\$CONDOR_MASTER_LOCATION" != "x" ]; then
		my_condor_master=\$CONDOR_MASTER_LOCATION
	fi
fi

if [ "x\$CONDOR_CONFIG" = "x" ]; then
	LOCALGUESS=\$HOME/Condor_glidein/condor_config.submit
	if [ -r \$LOCALGUESS ]; then
		CONDOR_CONFIG=\$LOCALGUESS
	fi
fi

if [ "x\$CONDOR_CONFIG" = "x" ]; then
	CONDOR_CONFIG=/opt/condor-c/etc/condor_config
fi

export CONDOR_CONFIG
echo "Using configuration file \$CONDOR_CONFIG"

if [ ! -x \$my_condor_master ]; then
	echo "ERROR finding condor_master as \$my_condor_master"
	exit 1
fi

# This assumes globus-user-env.sh and grid-proxy-info are set up correctly
# on the remote headnode.
. \$GLOBUS_LOCATION/etc/globus-user-env.sh
proxytime=`\$GLOBUS_LOCATION/bin/grid-proxy-info -timeleft`
runfor=`/usr/bin/expr \$proxytime / 60 - 5`

# We need to duplicate X509_USER_PROXY because Condor's daemon-core clears
# it from the environment before param()ing to see what file it should
# use for it's X509 credentials.
X509_DUPLICATE_USER_PROXY=\$X509_USER_PROXY
export X509_DUPLICATE_USER_PROXY 

_condor_GRIDMAP=`pwd`/$mapfile
export _condor_GRIDMAP

# Write the grid-mapfile
my_username=`/usr/bin/whoami`

_condor_QUEUE_SUPER_USERS=\$my_username
export _condor_QUEUE_SUPER_USERS

echo "\\\"$job_x509_subject\\\"" \$my_username > \$_condor_GRIDMAP
echo "\\\"$my_x509_subject\\\"" \$my_username >> \$_condor_GRIDMAP

exec \$my_condor_master -f -r \$runfor "\$@"

EOF

	# Buffered write errors only surface at close(); don't submit a
	# truncated script.
	unless( close( $script ) ) {
		debug_print( 1, "error writing \"$setup_script\": $!\n" );
		return undef;
	}

	# For now, we assume that binaries and a bare-bones config file
	# (ala glidein) are pre-installed at the execution site

	# Environment for the remote job, as ';'-terminated NAME=value pairs.
	$env .= "_condor_CONDOR_HOST=$config_condor_host;";

	$env .= "_condor_LOCAL_DIR=$base_dir/local.$schedd_name;";

	$env .= "_condor_CONDOR_ADMIN=$config_condor_admin;";

	$env .= "_condor_MAIL=$config_mail;";


	$env .= "_condor_GSI_DAEMON_NAME=$my_x509_subject;";

	if($DYNAMIC_SCHEDD_NAMES) {
		$env .= "_condor_MASTER_NAME=$schedd_name;";
		$env .= "_condor_SCHEDD_NAME=$schedd_name;";
	}

	$env .= "_condor_SCHEDD_ATTRS=SiteName,daemon_unique_name,CONDORC_WANTJOB,GLITE_ENV;";
	$env .= "_condor_SiteName=\"$group->{'sitename'}\";";
	$env .= "_condor_daemon_unique_name=\"$group->{'uniquename'}\";";
	$env .= "_condor_CONDORC_WANTJOB=TRUE;";

	# Drop the trailing ';'.
	chop $env;

#	$exec = "$config_sbin/condor_master";
	$exec = $setup_script;

	$args = $base_dir;

	# Escape '$' in submit file entries
	$env =~ s/\$/\$(DOLLAR)/g;
	$args =~ s/\$/\$(DOLLAR)/g;
	$exec =~ s/\$/\$(DOLLAR)/g;

	# Stupid globus bug 1486 means we can't use variable substitution in the
	# job's arguments (if there's more than one argument). So we can't pass the
	# path to the condor_master in the arguments. Instead, we'll just
	# hard-code it in this script for now.
	open( my $cmd, '>', $cmd_file ) || die( "Can't open $cmd_file" );
	print {$cmd} <<EOF;
# run a schedd on the remote resource
universe = grid
grid_resource = gt2 $group->{'sitegatekeeper'}
executable = $exec
arguments = $args
environment = $env
output = /$LOG_DIR/condorc-launcher.out.\$(cluster).\$(process)
error = /$LOG_DIR/condorc-launcher.err.\$(cluster).\$(process)
log = /$LOG_DIR/condorc-launcher.log.\$(cluster).\$(process)
x509userproxy = $group->{'proxyfile'}
should_transfer_files=YES
when_to_transfer_output=ON_EXIT
+DynamicScheddJob = True
+SiteName = "$group->{'sitename'}"
+daemon_unique_name = "$group->{'uniquename'}"
notification=NEVER
leave_in_queue=False
queue
EOF

	unless( close( $cmd ) ) {
		debug_print( 1, "error writing \"$cmd_file\": $!\n" );
		return undef;
	}


	my $submit_exitcode;
	$submit_exitcode = system( "condor_submit -s $cmd_file >$submit_out 2>&1" );
	if ($submit_exitcode != 0) {
		debug_print( 1, "Error running condor_submit\n" );
		return undef;
	}

	# snarf the cluster id from condor_submit's output
	my $submit;
	unless( open( $submit, '<', $submit_out ) ) {
		debug_print( 1, "error opening \"$submit_out\": $!\n" );
		return undef;
	}
	while( my $out_line = <$submit> ) {
		if( $out_line =~ /\d+ job\(s\) submitted to cluster (\d+)\./ ) {
			$cluster = $1;
			last;
		}
	}
	close( $submit );

	# if for some reason we didn't find the cluster id 
	unless( $cluster ) {
		debug_print( 1, "error: couldn't find cluster id in condor_submit output\n" );
		return undef;
	}

	# Since the schedd doesn't do a reschedule when a job's input files are
	# spooled, give it a kick in the butt.
	system( "condor_reschedule" );

	# Turn the cluster number into a fully-qualified condor job id.
	$cluster .= ".0";

	unlink $cmd_file, $submit_out, $setup_script;

	return $cluster;
}

# Quote $text so that /bin/sh treats it as a single literal word.
sub _condorc_shell_quote
{
	my( $text ) = @_;
	# Close the quote, emit an escaped single quote, reopen the quote.
	$text =~ s/'/'\\''/g;
	return "'$text'";
}

# Advertise an ad for $group with condor_advertise.
#
# The ad text is currently an empty placeholder, so an empty file is what
# gets handed to condor_advertise.  $group is accepted but not read yet.
# Dies if the scratch ad file cannot be opened; a failing
# condor_advertise is only logged.
sub advertise_schedd
{
	my( $group ) = @_;
	my $ad_file = "/$LOG_DIR/ad_file.$$";

	open( my $ad, '>', $ad_file ) || die "Failed to open file $ad_file";

	# Placeholder for the ad contents.
	print {$ad} <<EOF;
EOF

	close( $ad );

	# $rc is the raw status from system(), not just the exit code.
	my $rc = system( "condor_advertise UPDATE_STARTD_AD $ad_file" );
	if ( $rc != 0 ) {
		debug_print( 1, "condor_advertise exitted with $rc\n" );
	}

	# system() has returned, so the scratch file is no longer needed.
	unlink $ad_file;

	return;
}


# Print a timestamped message to the currently selected output handle,
# but only when its level does not exceed $DEBUG_LEVEL.
#   debug_print( $level, @message_parts );
sub debug_print {
	my( $level, @message ) = @_;

	# Messages above the configured verbosity are dropped.
	return if $level > $DEBUG_LEVEL;

	print tersedate();
	print " ";
	print @message;
}

# Return the current local time as "YYYY-MM-DD HH:MM:SS".
sub tersedate {
	my @now = localtime(time);

	# localtime() reports years since 1900 and months starting at 0.
	my $year  = $now[5] + 1900;
	my $month = $now[4] + 1;

	return sprintf "%04d-%02d-%02d %02d:%02d:%02d",
		$year, $month, $now[3], $now[2], $now[1], $now[0];
}
