#!/bin/sh
# set -x
#
#  Based on globus submission script for pbs
#
#  Submits job to HTCondor.
#  Input: path to grami file (same as Globus).
#
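# The job script (condorjob.sh) and the submit description (condorjob.jdl) are
# written into the job's session directory. Only the temporary files holding
# the output of condor_submit are removed at the end of this script.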

# Announce the start of the backend on stderr (stdout is kept for the
# "Submission: ..." status lines printed further down).
echo "----- starting submit_condor_job -----" >&2

# Backend name and the option names that belong to it. Nothing below reads
# these directly; presumably the sourced common code consumes them -- verify.
queue_options="condor_requirements"
lrms_options="condor_requirements condor_rank condor_bin_path condor_config"
joboption_lrms="condor"

# An optional leading "--config <file>" pair (ARC1 calling convention) is
# consumed first; the next positional argument is the grami file.
case "$1" in
  --config)
    shift
    ARC_CONFIG=$1
    shift
    ;;
esac
GRAMI_FILE=$1

# define paths and config parser
# Resolve the directory that holds this script to an absolute path. Both "$0"
# and the intermediate value are quoted so that an installation path that
# contains whitespace is not word-split.
basedir=$(dirname "$0")
basedir=$(cd "$basedir" > /dev/null && pwd) || exit $?
. "${basedir}/lrms_common.sh"

# include common submit functions
# NOTE(review): pkgdatadir is not assigned in this file; presumably it is set
# by lrms_common.sh -- confirm.
. "${pkgdatadir}/submit_common.sh" || exit $?

# run common init
#  * parse grami
#  * parse config
#  * load LRMS-specific env
#  * set common variables
common_init

# Performance log target; the timestamp is only taken when a perflog
# directory is configured.
perflogfilesub="${perflogdir}/submission.perflog"
[ -z "$perflogdir" ] || start_ts=$(date +%s.%N)

# Without RUNTIME_NODE_SEES_FRONTEND the scratch directory is the *literal*
# text ${_CONDOR_SCRATCH_DIR}, so that it is expanded later, when the
# generated job script runs, rather than here.
[ -n "${RUNTIME_NODE_SEES_FRONTEND}" ] || RUNTIME_LOCAL_SCRATCH_DIR='${_CONDOR_SCRATCH_DIR}'

# check remote or local scratch is configured
# NOTE(review): check_any_scratch is not defined in this file; presumably it
# is provided by the sourced common submit functions -- confirm.
check_any_scratch

##############################################################
# Zero stage of runtime environments
##############################################################
# NOTE(review): RTE_stage0 is defined outside this file; its effects cannot
# be seen from here.
RTE_stage0

##############################################################
# create job script
##############################################################
# NOTE(review): mktempscript is defined outside this file. The code below
# removes "$LRMS_JOB_SCRIPT" and later uses LRMS_JOB_OUT / LRMS_JOB_ERR, none
# of which are assigned before this call, so presumably mktempscript sets
# them -- confirm.
mktempscript

is_cluster=true
##############################################################
# Start job description file
##############################################################

# condor_submit is taken from CONDOR_BIN_PATH when that is set, otherwise it
# is looked up through PATH.
CONDOR_SUBMIT="${CONDOR_BIN_PATH:+${CONDOR_BIN_PATH}/}condor_submit"

# Drop whatever file LRMS_JOB_SCRIPT pointed at so far, then place both the
# HTCondor job script and its submit description in the job's directory.
rm -f "$LRMS_JOB_SCRIPT"
LRMS_JOB_DESCRIPT="${joboption_directory}/condorjob.jdl"
LRMS_JOB_SCRIPT="${joboption_directory}/condorjob.sh"

# Start the submit description file (truncating any previous content) with
# the fixed header entries. The target is quoted so that a job directory
# containing whitespace does not break the redirection.
{
  echo "# HTCondor job description built by arex"
  echo "Executable = condorjob.sh"
  echo "Input = $joboption_stdin"
  echo "Log = ${joboption_directory}/log"
} > "$LRMS_JOB_DESCRIPT"

# write HTCondor output to .comment file if possible, but handle the situation when
# jobs are submitted by HTCondor-G < 8.0.5
condor_stdout="${joboption_directory}.comment"
condor_stderr="${joboption_directory}.comment"
if [ -z "${RUNTIME_NODE_SEES_FRONTEND}" ] ; then
  # A stdout name ending in "_condor_stdout" switches both streams to the
  # names requested by the job. A case pattern is used instead of the
  # non-POSIX "expr match", which also misparses operator-like strings.
  case "$joboption_stdout" in
    *_condor_stdout)
      condor_stdout=$joboption_stdout
      condor_stderr=$joboption_stderr
      ;;
  esac
fi
{
  echo "Output = $condor_stdout"
  echo "Error = $condor_stderr"
} >> "$LRMS_JOB_DESCRIPT"

# queue
if [ -n "${joboption_queue}" ] ; then
  echo "+NordugridQueue = \"$joboption_queue\"" >> "$LRMS_JOB_DESCRIPT"
fi

# job name for convenience
if [ -n "${joboption_jobname}" ] ; then
  #TODO is this necessary? do parts of the infosys need these limitations?
  # Sanitise the requested name in a single sed run:
  #   1. prefix "N" when the first character is not a letter,
  #   2. replace every non-alphanumeric character with "_",
  #   3. keep at most the first 15 characters.
  # printf is used so a name that looks like an echo option or contains
  # backslashes is passed through unchanged.
  jobname=$(printf '%s\n' "$joboption_jobname" \
    | sed -e 's/^\([^[:alpha:]]\)/N\1/' \
          -e 's/[^[:alnum:]]/_/g' \
          -e 's/\(...............\).*/\1/')
else
  jobname="gridjob"
fi
echo "Description = $jobname" >> "$LRMS_JOB_DESCRIPT"

# universe
# DOCKER_UNIVERSE, when set, replaces the default "vanilla" universe and
# turns on file transfer. The tests are quoted: unquoted, a value containing
# whitespace made "[" fail with a syntax error and silently select vanilla.
if [ -n "$DOCKER_UNIVERSE" ] ; then
  {
    echo "Universe = $DOCKER_UNIVERSE"
    echo "should_transfer_files   = YES"
    echo "when_to_transfer_output = ON_EXIT"
  } >> "$LRMS_JOB_DESCRIPT"
else
  echo "Universe = vanilla" >> "$LRMS_JOB_DESCRIPT"
fi

if [ -n "$DOCKER_IMAGE" ] ; then
  echo "docker_image = $DOCKER_IMAGE" >> "$LRMS_JOB_DESCRIPT"
fi

# notification
echo "Notification = Never" >> "$LRMS_JOB_DESCRIPT"

# no job restart
# The job may only match while it has never been started, and it is removed
# if it is found with JobStatus 1 (Idle in HTCondor's job ClassAd) after
# having started before. PERIODIC_REMOVE is extended by later sections and
# written to the description only once it is complete.
REQUIREMENTS="(NumJobStarts == 0)"
PERIODIC_REMOVE="(JobStatus == 1 && NumJobStarts > 0)"

# custom requirements
if [ -n "$CONFIG_condor_requirements" ] ; then
  # custom requirement from arc.conf
  REQUIREMENTS="${REQUIREMENTS} && ( $CONFIG_condor_requirements )"
fi
echo "Requirements = ${REQUIREMENTS}" >> "$LRMS_JOB_DESCRIPT"

#####################################################
# priority
#####################################################
if [ -n "$joboption_priority" ]; then
  #Condor uses any integer as priority. 0 being default. Only per user basis.
  #We assume that only grid jobs are relevant.
  #In that case we can use ARC 0-100 but translated so default is 0.
  priority=$(( joboption_priority - 50 ))
  echo "Priority = $priority" >> "$LRMS_JOB_DESCRIPT"
fi

# rank
# Passed through verbatim from the configuration when present.
if [ -n "$CONFIG_condor_rank" ] ; then
  echo "Rank = $CONFIG_condor_rank" >> "$LRMS_JOB_DESCRIPT"
fi

# proxy or token
# user.proxy in the job directory is announced as an X.509 proxy when its
# first line contains "BEGIN CERTIFICATE", otherwise as a SciTokens file.
# Nothing is written when the file does not exist.
if [ -f "${joboption_directory}/user.proxy" ]; then
  if head -n 1 "${joboption_directory}/user.proxy" | grep -F -q 'BEGIN CERTIFICATE' ; then
    echo "x509userproxy = ${joboption_directory}/user.proxy" >> "$LRMS_JOB_DESCRIPT"
  else
    echo "scitokens_file = ${joboption_directory}/user.proxy" >> "$LRMS_JOB_DESCRIPT"
  fi
fi

##############################################################
# (non-)parallel jobs
##############################################################

# NOTE(review): set_count is defined outside this file; presumably it
# normalises joboption_count -- confirm.
set_count

# Request CPUs only for a positive count. The operands are quoted so that an
# empty value yields a clean "false" instead of a malformed test.
if [ -n "$joboption_count" ] && [ "$joboption_count" -gt 0 ] ; then
  echo "request_cpus = $joboption_count" >> "$LRMS_JOB_DESCRIPT"
fi

if [ "$joboption_exclusivenode" = "true" ]; then
  echo "+RequiresWholeMachine=True" >> "$LRMS_JOB_DESCRIPT"
fi

##############################################################
# Execution times (minutes)
##############################################################
# NOTE(review): the limits written here are compared against RemoteUserCpu,
# RemoteSysCpu and RemoteWallClockTime, which HTCondor documents in seconds;
# the "(minutes)" label above may be stale -- confirm.

if [ -n "$joboption_cputime" ] ; then
  if [ "$joboption_cputime" -lt 0 ] ; then
    # Double quotes so the offending value is actually printed.
    echo "WARNING: Less than 0 cpu time requested: $joboption_cputime" 1>&2
    joboption_cputime=0
    echo 'WARNING: cpu time set to 0' 1>&2
  fi
  # Per-slot CPU time, used below to derive a wall time. Fall back to a
  # divisor of 1 when no positive count is known, instead of aborting on an
  # empty operand or a division by zero.
  cpu_divisor=1
  if [ -n "$joboption_count" ] && [ "$joboption_count" -gt 0 ] ; then
    cpu_divisor=$joboption_count
  fi
  maxcputime=$(( joboption_cputime / cpu_divisor ))
  echo "+JobCpuLimit = $joboption_cputime" >> "$LRMS_JOB_DESCRIPT"
  PERIODIC_REMOVE="${PERIODIC_REMOVE} || RemoteUserCpu + RemoteSysCpu > JobCpuLimit"
fi

if [ -z "$joboption_walltime" ] && [ -n "$joboption_cputime" ] ; then
  # Set walltime for backward compatibility or incomplete requests.
  # walltime_ratio is not assigned in this file; default to 1 when it is
  # unset so the arithmetic cannot fail.
  joboption_walltime=$(( maxcputime * ${walltime_ratio:-1} ))
fi

if [ -n "$joboption_walltime" ] ; then
  if [ "$joboption_walltime" -lt 0 ] ; then
    echo "WARNING: Less than 0 wall time requested: $joboption_walltime" 1>&2
    joboption_walltime=0
    echo 'WARNING: wall time set to 0' 1>&2
  fi
  echo "+JobTimeLimit = $joboption_walltime" >> "$LRMS_JOB_DESCRIPT"
  PERIODIC_REMOVE="${PERIODIC_REMOVE} || RemoteWallClockTime > JobTimeLimit"
fi

##############################################################
# Requested memory (mb)
##############################################################

# NOTE(review): set_req_mem is defined outside this file; presumably it
# fills in joboption_memory -- confirm.
set_req_mem

if [ -n "$joboption_memory" ] ; then
  # NOTE(review): despite its name, memory_bytes is joboption_memory * 1024.
  # With joboption_memory in MB (see the header above) that is a KiB figure,
  # which is the unit HTCondor documents for ResidentSetSize -- confirm.
  memory_bytes=$(( joboption_memory * 1024 ))
  memory_req=$(( joboption_memory ))
  # HTCondor needs to know the total memory for the job, not memory per core
  if [ -n "$joboption_count" ] && [ "$joboption_count" -gt 0 ] ; then
    memory_bytes=$(( joboption_count * memory_bytes ))
    memory_req=$(( joboption_count * memory_req ))
  fi
  {
    echo "request_memory=$memory_req"
    echo "+JobMemoryLimit = $memory_bytes"
  } >> "$LRMS_JOB_DESCRIPT"
  # it is important to protect evaluation from undefined ResidentSetSize
  PERIODIC_REMOVE="${PERIODIC_REMOVE} || ((ResidentSetSize isnt undefined ? ResidentSetSize : 0) > JobMemoryLimit)"
fi

##############################################################
#  HTCondor stage in/out
##############################################################
if [ -z "${RUNTIME_NODE_SEES_FRONTEND}" ] ; then
  # Probe the session directory in a subshell so the current directory of
  # this script stays untouched. The failure handling lives OUTSIDE the
  # subshell: an "exit" inside "( ... )" would only leave the subshell and
  # the script would carry on after deleting its own files.
  if ! ( cd "$joboption_directory" ) ; then
    echo "Can't change to session directory: $joboption_directory" 1>&2
    rm -f "$LRMS_JOB_SCRIPT" "$LRMS_JOB_DESCRIPT" "$LRMS_JOB_OUT" "$LRMS_JOB_ERR"
    echo "Submission: Configuration error."
    exit 1
  fi
  # transfer all session directory if not shared between ARC CE and worknodes
  {
    echo "should_transfer_files = YES"
    echo "When_to_transfer_output = ON_EXIT_OR_EVICT"
    echo "Transfer_input_files = $joboption_directory"
  } >> "$LRMS_JOB_DESCRIPT"
fi

# PERIODIC_REMOVE has been accumulated by the sections above; write it out
# and close the description with the Queue statement.
{
  echo "Periodic_remove = ${PERIODIC_REMOVE}"
  echo "Queue"
} >> "$LRMS_JOB_DESCRIPT"

# Start the job script from scratch: bash login-shell shebang followed by a
# restrictive umask. The emitted text is unchanged; only the target is now
# quoted so a job directory containing whitespace works.
{
  echo "#!/bin/bash -l"
  echo ""
  echo "# Overide umask of execution node (sometime values are really strange)"
  echo "umask 077"
  echo " "
} > "$LRMS_JOB_SCRIPT"

# Script must have execute permission
chmod 0755 "$LRMS_JOB_SCRIPT"

# NOTE(review): sourcewithargs_jobscript is defined outside this file;
# presumably it appends a helper to the job script -- confirm.
sourcewithargs_jobscript

##############################################################
# Init accounting
##############################################################
accounting_init

##############################################################
# Add environment variables
##############################################################
add_user_env

##############################################################
# A job without an executable (joboption_arg_0) cannot be
# submitted: discard the generated files, report the problem
# on stdout and stop. Whether the executable actually exists
# is deliberately not checked here, since files may be
# downloaded directly to the computing node.
##############################################################
[ -n "${joboption_arg_0}" ] || {
  echo 'Executable is not specified' >&2
  rm -f "$LRMS_JOB_SCRIPT" "$LRMS_JOB_DESCRIPT" "$LRMS_JOB_OUT" "$LRMS_JOB_ERR"
  echo "Submission: Job description error."
  exit 1
}

######################################################################
# Adjust working directory for tweaky nodes
# RUNTIME_GRIDAREA_DIR should be defined by external means on nodes
######################################################################

# Print path $1 relative to the session directory (joboption_directory):
# when $1 starts with that directory, the prefix and any slashes following
# it are removed; otherwise $1 is printed unchanged. The prefix is compared
# literally, so characters such as "." or "#" in the directory name are not
# interpreted as a pattern (the former sed expression treated them as one).
condor_session_relpath() {
  case "$1" in
    "${joboption_directory}"*)
      _csr_rel=${1#"${joboption_directory}"}
      while [ "${_csr_rel#/}" != "$_csr_rel" ] ; do
        _csr_rel=${_csr_rel#/}
      done
      printf '%s\n' "$_csr_rel"
      ;;
    *)
      printf '%s\n' "$1"
      ;;
  esac
}

if [ -n "${RUNTIME_NODE_SEES_FRONTEND}" ] ; then
  setup_runtime_env
else
  RUNTIME_STDIN_REL=$(condor_session_relpath "${joboption_stdin}")
  RUNTIME_STDOUT_REL=$(condor_session_relpath "${joboption_stdout}")
  RUNTIME_STDERR_REL=$(condor_session_relpath "${joboption_stderr}")
  # A stream whose path is unchanged lies outside the session directory and
  # is used as is; any other stream is re-rooted under the scratch directory
  # of the job.
  {
    echo "RUNTIME_JOB_DIR=$RUNTIME_LOCAL_SCRATCH_DIR/$joboption_gridid"
    echo "RUNTIME_JOB_DIAG=$RUNTIME_LOCAL_SCRATCH_DIR/${joboption_gridid}.diag"
    if [ "$RUNTIME_STDIN_REL" = "${joboption_stdin}" ] ; then
      echo "RUNTIME_JOB_STDIN=\"${joboption_stdin}\""
    else
      echo "RUNTIME_JOB_STDIN=\"$RUNTIME_LOCAL_SCRATCH_DIR/$joboption_gridid/$RUNTIME_STDIN_REL\""
    fi
    if [ "$RUNTIME_STDOUT_REL" = "${joboption_stdout}" ] ; then
      echo "RUNTIME_JOB_STDOUT=\"${joboption_stdout}\""
    else
      echo "RUNTIME_JOB_STDOUT=\"$RUNTIME_LOCAL_SCRATCH_DIR/$joboption_gridid/$RUNTIME_STDOUT_REL\""
    fi
    if [ "$RUNTIME_STDERR_REL" = "${joboption_stderr}" ] ; then
      echo "RUNTIME_JOB_STDERR=\"${joboption_stderr}\""
    else
      echo "RUNTIME_JOB_STDERR=\"$RUNTIME_LOCAL_SCRATCH_DIR/$joboption_gridid/$RUNTIME_STDERR_REL\""
    fi
  } >> "$LRMS_JOB_SCRIPT"
fi

##############################################################
# Add std... to job arguments
##############################################################
include_std_streams

##############################################################
#  Move files to local working directory (job is done on node only)
#  RUNTIME_JOB_DIR -> RUNTIME_LOCAL_SCRATCH_DIR/job_id
##############################################################
move_files_to_node

##############################################################
#  Initialise RESULT in the job script and open the outer guard:
#  skip execution if something already failed
##############################################################
{
  echo ""
  echo "RESULT=0"
  echo ""
  echo "if [ \"\$RESULT\" = '0' ] ; then"
} >> "$LRMS_JOB_SCRIPT"

##############################################################
#  Runtime configuration at computing node
##############################################################
RTE_stage1

##############################################################
#  Diagnostics
##############################################################
# Emits a line that, when the job script runs, appends the value of
# runtimeenvironments to the file named by RUNTIME_JOB_DIAG.
echo "echo \"runtimeenvironments=\$runtimeenvironments\" >> \"\$RUNTIME_JOB_DIAG\"" >> "$LRMS_JOB_SCRIPT"

##############################################################
# Accounting (WN OS Detection)
##############################################################
detect_wn_systemsoftware

##############################################################
#  Check intermediate result again (inner guard)
##############################################################
echo "if [ \"\$RESULT\" = '0' ] ; then" >> "$LRMS_JOB_SCRIPT"

##############################################################
#  Execution
##############################################################
cd_and_run

##############################################################
#  End of RESULT checks: close the inner and the outer guard
##############################################################
{
  echo "fi"
  echo "fi"
} >> "$LRMS_JOB_SCRIPT"

##############################################################
#  Runtime (post)configuration at computing node
##############################################################
# NOTE(review): RTE_stage2 and the three helpers below are defined outside
# this file; presumably each appends its part to the job script -- confirm.
# They run after both RESULT guards have been closed above.
RTE_stage2

#####################################################
#  Clean up output files in the local scratch dir
#####################################################
# NOTE(review): the meaning of the "moveup" argument is determined by the
# helper and cannot be seen from here.
clean_local_scratch_dir_output "moveup"

##############################################################
#  Move files back to session directory (job is done on node only)
#  RUNTIME_JOB_DIR -> RUNTIME_LOCAL_SCRATCH_DIR/job_id
##############################################################
move_files_to_frontend

##############################################################
# Finish accounting and exit job
##############################################################
accounting_end

# When performance logging is configured, record how long it took to build
# the job script (time elapsed since start_ts). The two timestamps are handed
# to awk as variables rather than pasted into the program text, and the log
# file name is quoted.
if [ -n "$perflogdir" ]; then
  stop_ts=$(date +%s.%N)
  t=$(awk -v stop="$stop_ts" -v start="$start_ts" 'BEGIN { printf "%.3f", stop - start }')
  echo "[$(date +%Y-%m-%d\ %T)] submit-condor-job, JobScriptCreation: $t" >> "$perflogfilesub"
fi

#######################################
#  Submit the job
#######################################
echo "HTCondor job script built" 1>&2
# Execute condor_submit command
# The description names its executable relative to the current directory
# ("Executable = condorjob.sh"), so submitting from anywhere else cannot
# work; stop instead of silently continuing in the wrong directory.
if ! cd "$joboption_directory" ; then
  echo "Can't change to session directory: $joboption_directory" 1>&2
  rm -f "$LRMS_JOB_OUT" "$LRMS_JOB_ERR"
  echo "Submission: Configuration error."
  exit 1
fi
echo "HTCondor script follows:" 1>&2
echo "-------------------------------------------------------------------" 1>&2
cat "$LRMS_JOB_SCRIPT" 1>&2
echo "-------------------------------------------------------------------" 1>&2
echo "" 1>&2

# Try condor_submit up to 10 times, pausing 2 seconds after each failure.
# stdout and stderr of the last attempt stay in LRMS_JOB_OUT / LRMS_JOB_ERR
# and its exit status in CONDOR_RESULT.
CONDOR_RESULT=1
CONDOR_TRIES=0
while [ "$CONDOR_TRIES" -lt 10 ] ; do
  if [ -n "$perflogdir" ]; then
    start_ts=$(date +%s.%N)
  fi

  "${CONDOR_SUBMIT}" "$LRMS_JOB_DESCRIPT" 1>"$LRMS_JOB_OUT" 2>"$LRMS_JOB_ERR"
  CONDOR_RESULT=$?

  if [ -n "$perflogdir" ]; then
    stop_ts=$(date +%s.%N)
    t=$(awk -v stop="$stop_ts" -v start="$start_ts" 'BEGIN { printf "%.3f", stop - start }')
    echo "[$(date +%Y-%m-%d\ %T)] submit-condor-job, JobSubmission: $t" >> "$perflogfilesub"
  fi

  if [ "$CONDOR_RESULT" -eq 0 ] ; then break ; fi
  CONDOR_TRIES=$(( CONDOR_TRIES + 1 ))
  sleep 2
done
# Print the cluster id announced by condor_submit in the text given as $1
# ("... submitted to cluster <id>."); prints nothing when no such line is
# present. The id is taken from the matching line itself rather than from a
# fixed word position, so additional lines in the output (for example
# warnings) cannot shift it.
condor_cluster_id() {
  printf '%s\n' "$1" \
    | sed -n 's/.*submitted to cluster[[:space:]]\([0-9][0-9]*\).*/\1/p' \
    | head -n 1
}

# Evaluate the outcome of the submission loop. On success the job id and the
# HTCondor log location are appended to the grami file and the script exits
# with status 0. Every other path prints a "Submission: ..." line on stdout
# and falls through to the failure report that follows.
if [ "$CONDOR_RESULT" -eq 0 ] ; then
  job_out=$(cat "$LRMS_JOB_OUT")

  if [ -z "${job_out}" ]; then
    echo "job *NOT* submitted successfully!" 1>&2
    echo "failed getting the condor jobid for the job!" 1>&2
    echo "Submission: Local submission client behaved unexpectedly."
  elif [ "$(printf '%s\n' "${job_out}" | grep -Ec 'submitted to cluster[[:space:]][0-9]+')" != "1" ]; then
    # Exactly one line announcing the cluster is expected.
    echo "job *NOT* submitted successfully!" 1>&2
    echo "badly formatted condor jobid for the job !" 1>&2
    echo "Submission: Local submission client behaved unexpectedly."
  else
    job_id=$(condor_cluster_id "${job_out}")
    hostname=$(hostname -f)
    {
      echo "joboption_jobid=${job_id}.${hostname}"
      echo "condor_log=${joboption_directory}/log"
    } >> "$GRAMI_FILE"
    echo "job submitted successfully!" 1>&2
    echo "local job id: $job_id" 1>&2
    # Remove temporary files
    rm -f "$LRMS_JOB_OUT" "$LRMS_JOB_ERR"
    echo "----- exiting submit_condor_job -----" 1>&2
    echo "" 1>&2
    exit 0
  fi
else
  echo "job *NOT* submitted successfully!" 1>&2
  echo "got error code from condor_submit: $CONDOR_RESULT !" 1>&2
  echo "Submission: Local submission client failed."
fi
# Failure epilogue: dump what condor_submit printed to stderr, remove the
# temporary output files and exit with status 1. The file names are quoted
# on purpose: with an empty variable an unquoted "cat" would have no operand
# and would block reading standard input.
{
  echo "Output is:"
  cat "$LRMS_JOB_OUT"
  echo "Error output is:"
  cat "$LRMS_JOB_ERR"
} 1>&2
rm -f "$LRMS_JOB_OUT" "$LRMS_JOB_ERR"
echo "----- exiting submit_condor_job -----" 1>&2
echo "" 1>&2
exit 1
