#!/bin/bash #------------------------------------------------------------------------------# # Script to launch an interactive job onto Atos HPCF #------------------------------------------------------------------------------# # EC_GIT_TAG='{% include "ec_git_tag.j2" %}' # #{% raw %} DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" __ScriptVersion="0.43" # Default values PLATFORM=hpc QOS="ni" NCPUS="2" NGPUS="0" MEMORY="8" TMPSIZE=3 TIME="12:00:00" JOB_OUTPUT=/dev/null #JOB_OUTPUT=%x.out if [[ "$EC_CLUSTER" == "ecs" ]]; then PLATFORM=ecs fi # User override defaults if [[ -e ~/.ecinteractiverc ]]; then source ~/.ecinteractiverc fi # Make sure we can use an existing Temporary directly if [[ -n $TMPDIR ]] && [[ ! -d $TMPDIR ]]; then TMPDIR=/tmp fi SSH="ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ConnectTimeout=10 -o LogLevel=ERROR -oSendEnv=STHOST -q" # Session control options do not work on Windows/MobaXterm if [[ -z "${PUTTYHOME:-''}" ]]; then SSH="$SSH -o ControlPath=\"${TMPDIR:-/tmp}/ssh_%r@%h-%p\" -o ControlMaster=auto -o ControlPersist=60" else #MobaXterm does not define USER USER=${USER:-$USERNAME} fi VNC_PASSWORD_FILE=ecmwf_passwd #=== FUNCTION ================================================================ # NAME: usage # DESCRIPTION: Display usage information. #=============================================================================== function usage () { echo "Usage : $0 [options] [--] [CMD] -d|desktop Submits a vnc job (default is interactive ssh job) -j|jupyter Submits a jupyter job (default is interactive ssh job) -J|jupyters Submits a jupyter job with HTTPS support (default is interactive ssh job) More Options: -h|help Display this message -v|version Display script version -p|platform Platform (default $PLATFORM. Choices: hpc, ecs) -u|user ECMWF User (default ${ECUSER:-$USER}) -A|account Project account -c|cpus Number of CPUs (default $NCPUS) -m|memory Requested Memory (default $MEMORY GB) -s|tmpdirsize Requested LOCALSSD/TMPDIR size (default $TMPSIZE GB) -g|gpu Request a GPU (limited availability) -t|time Wall clock limit (default $TIME) -f|forward Ports to forward, comma separated (default NONE) -e|export Environment variables to export, comma separated (default NONE) -k|kill Cancel any running interactive job -q|query Check running job -Q|quiet Silent mode -o|output Output file for the interactive job (default /dev/null) -x set -x" if [[ -w "$DIR/ecinteractive" ]]; then echo " -U|update Update ecinteractive to latest version" fi } # ---------- end of function usage ---------- #=== FUNCTION ================================================================ # NAME: update # DESCRIPTION: Update tool to latest version #=============================================================================== function update () { if [[ ! -w "$DIR/ecinteractive" ]]; then echo -e "ERROR: Insufficient permissions to update" >&2 exit 1 fi curl -s -o /tmp/ecinteractive.tmp https://git.ecmwf.int/projects/USS/repos/ecinteractive/raw/ecinteractive if ! cmp -s /tmp/ecinteractive.tmp $DIR/ecinteractive; then mv /tmp/ecinteractive.tmp $DIR/ecinteractive chmod +x $DIR/ecinteractive newver=$($DIR/ecinteractive -v) echo "Updated ecinteractive to ${newver##* }" else rm /tmp/ecinteractive.tmp fi exit } # ---------- end of function update ---------- #=== FUNCTION ================================================================ # NAME: submit_job # DESCRIPTION: Submits a job into HOST using stdin #=============================================================================== function submit_job { STH="" if [[ -n "$STHOST" ]]; then STH="#SBATCH --export=STHOST" fi RES="" if [[ -n "$RESERVATION" ]]; then RES="#SBATCH --reservation=$RESERVATION" fi if [[ -n "$EXPORT_VARS" ]]; then JOB_ENVIRONMENT="#SBATCH --export=$EXPORT_VARS" fi ACC="" if [[ -n $ACCOUNT ]]; then echo " Accounting project: $ACCOUNT" ACC="#SBATCH --account=$ACCOUNT" fi SSDTMP="#SBATCH --gres=ssdtmp:${TMPSIZE}G" if [[ $NGPUS -gt 0 ]]; then SSDTMP="" fi [[ $QUIET -eq 0 ]] && SBATCH_QUIET="" || SBATCH_QUIET="-Q" $SSH -x $HOST sbatch $SBATCH_QUIET < \$TMPDIR/bin/sbatch < ~/.jupyter/lab/user-settings/jupyterlab-topbar-text/plugin.jupyterlab-settings jupyter server list 2>&1 | grep -q "http[s]\?://.*/?token=[a-z0-9]\+" || jupyter lab --ip=0.0.0.0 --port=\$((UID+30000)) --no-browser \$jupyter_ssl & unset jupyter_ssl set +x } alarm_popup () { set -x left=\${1:-1} howlong=\$(($(to_secs $TIME) - \$left*60)) if [[ \$howlong -gt 0 ]]; then sleep \$howlong msg=\$(echo -e "Job \$SLURM_JOB_ID\\n\$left min left!" | banner) # msg=\$(cowsay Job \$SLURM_JOB_ID: \$left min left!) echo -e "\$msg" | wall DISPLAY=:\$UID /usr/bin/xterm -T "WARNING" -e "echo -e \"\$msg\"; sleep 10" || true fi unset left howlong msg set +x } trap start_vnc USR1 trap start_jupyter USR2 # Set 30, 5 and 1 min alarms alarm_popup 30 & alarm_popup 5 & alarm_popup 1 & set +x while :; do sleep 1 done exit 0 EOFHERE if [[ $? -ne 0 ]]; then exit 1 fi } #=== FUNCTION ================================================================ # NAME: to_secs # DESCRIPTION: Convert Slurm time definitions into seconds #=============================================================================== function to_secs () { _intime=$1 _seconds=0 dayformat=0 if [[ "$_intime" == *"-"* ]]; then _seconds=$((${_intime%-*}*3600*24)) _intime=${_intime#*-} _dayformat=1 elif [[ "$_intime" != *":"* ]]; then echo $((${_intime}*60)) return fi _timearray=(${_intime//:/ }) if [[ ${#_timearray[@]} -eq 1 ]]; then echo $(($_seconds + 10#${_timearray[0]}*3600)) elif [[ ${#_timearray[@]} -eq 2 && $_dayformat -eq 0 ]]; then echo $(($_seconds + 10#${_timearray[0]}*60 + 10#${_timearray[1]})) elif [[ ${#_timearray[@]} -eq 2 && $_dayformat -eq 1 ]]; then echo $(($_seconds + 10#${_timearray[0]}*3600 + 10#${_timearray[1]}*60)) elif [[ ${#_timearray[@]} -eq 3 ]]; then echo $(($_seconds + 10#${_timearray[0]}*3600 + 10#${_timearray[1]}*60 + 10#${_timearray[2]})) else echo 0 fi unset _intime _timearray _seconds } #=== FUNCTION ================================================================ # NAME: get_motd # DESCRIPTION: Get the MOTD #=============================================================================== function get_motd() { [[ $QUIET -eq 0 ]] || return $SSH -x $HOST "[ -x /usr/local/apps/ecinteractive/resources/motd.sh ] && /usr/local/apps/ecinteractive/resources/motd.sh" } #=== FUNCTION ================================================================ # NAME: check_running_job # DESCRIPTION: checks if the job is running #=============================================================================== function check_running_job () { nchecks=${1:-0} icheck=0 while [[ "$state" != "R"* ]]; do naptime=$(($icheck*5*$icheck)) if [[ $naptime -gt 0 && $QUIET -eq 0 ]]; then echo -e "Waiting $naptime seconds for the job to be ready..." fi sleep $naptime query_job icheck=$(($icheck+1)) if [[ $icheck -ge nchecks ]]; then break fi done while [[ "$state" == "C"* ]]; do naptime=10 [[ $QUIET -eq 0 ]] && echo -e "Waiting for the previous job to complete..." sleep $naptime query_job done if [ "$node" == "n/a" ]; then echo "ERROR: Your interactive job $jobid is still in the queue. Try again in a few minutes, or cancel it with $0 $PLATARG-k" >&2 exit 1 fi if [ "$node" != "" ]; then [[ $QUIET -eq 0 ]] && echo -e "Using interactive job:" [[ $QUIET -eq 0 ]] && print_queue_status if [[ "$numcpus" != "$NCPUS" || "$memory" != "${MEMORY}" || ("$tmpsize" != "N/A" && "$tmpsize" != "gres:ssdtmp:${TMPSIZE}G") || "$(to_secs $timelimit)" != "$(to_secs $TIME)" ]]; then [[ $QUIET -eq 0 ]] && echo -e "" [[ $QUIET -eq 0 ]] && echo "WARNING: Your existing job $jobid may have a different setup than requested. Cancel the existing job and rerun if you with to run with different setup" >&2 fi [[ $QUIET -eq 0 ]] && echo -e "" [[ $QUIET -eq 0 ]] && echo -e "To cancel the job:\n\t$0 $PLATARG-k" [[ $QUIET -eq 0 ]] && echo -e "" if [[ $NGPUS -eq 0 ]]; then [[ $QUIET -eq 0 ]] && echo -e "You may restore your saved LOCALSSD from previous sessions with:\n\tec_restore_local_ssd -r" [[ $QUIET -eq 0 ]] && echo -e "" fi if [ $INTERACTIVE -ne 0 ]; then X11="" if [[ -n $DISPLAY ]]; then X11="-X" fi if [[ -n "$EXPORT_VARS" ]]; then SSH_EXPORT="" _oldifs=$IFS IFS="," for elem in $EXPORT_VARS; do if [[ "$elem" == *"="* ]]; then SSH_EXPORT="${elem%=*}=${elem#*=} $SSH_EXPORT" else SSH_EXPORT="$elem=${!elem} $SSH_EXPORT" fi done IFS=$_oldifs if [[ -z "$TO_RUN" ]]; then $SSH -t $X11 $PORT_FORWARD $node "$SSH_EXPORT exec \$SHELL -li" else $SSH -t $X11 $PORT_FORWARD "$node" "$SSH_EXPORT exec \$SHELL -li -c \"$TO_RUN\"" fi else if [[ -z "$TO_RUN" ]]; then $SSH $X11 $PORT_FORWARD "$node" else $SSH -t $X11 $PORT_FORWARD "$node" "exec \$SHELL -li -c \"$TO_RUN\"" fi fi if [[ $? -eq 1 ]]; then [[ $QUIET -eq 0 ]] && echo -e "" [[ $QUIET -eq 0 ]] && echo -e "WARNING: Your job $jobid is still running on $PLATFORM. To reattach to the job:\n\t$0 $PLATARG\n\nTo cancel the job:\n\t$0 $PLATARG-k" [[ $QUIET -eq 0 ]] && echo -e "" exit 0 else exit 1 fi elif [ $JUPYTER -ne 0 ]; then [[ $QUIET -eq 0 ]] && echo -e "Attaching to Jupyterlab session..." sleep 5 $SSH -x $HOST "/usr/local/bin/ecscancel --signal=USR2 -b $jobid" sleep 5 xdg_open=$(which xdg-open > /dev/null && echo xdg-open || echo open) jupyterurl=$($SSH -x $node "bash -lc '[[ -e ~/.ecinteractive/jupyter_setup.sh ]] && source ~/.ecinteractive/jupyter_setup.sh || module load python3; jupyter server list' 2>&1 | grep -o \"http[s]\?://.*/?token=[a-z0-9]\+\" | sed -e \"s/0.0.0.0/\$(hostname -s).ecmwf.int/\" -e \"s/bullx/ecmwf.int/\"") if [[ -z $jupyterurl ]]; then echo -e "ERROR: could not set up a jupyter session" >&2 exit 1 fi if [[ "$jupyterurl" == "https"* && $QUIET -eq 0 ]]; then echo "" echo "Using self-signed certificate for HTTPS connection to Jupyter" echo "To avoid browser warnings fetch ~/.ssl/selfCA.crt from $PLATFORM and import in your browser as trusted Certificate Authority" echo "" fi if [[ -z $ECPLATFORM ]]; then jupyterurl=$(echo $jupyterurl | sed -e "s|//.*:|//localhost:|") jupyterport=$(echo $jupyterurl| cut -d'/' -f3 | cut -d: -f2) if [[ -z $(ps x| grep -- -L$jupyterport | grep -v grep | awk '{print $1}') ]]; then [[ $QUIET -eq 0 ]] && echo -e "Opening SSH tunnel for Jupyter..." $SSH -x -N -f -L$jupyterport:$node:$jupyterport $node fi fi [[ $QUIET -eq 0 ]] && echo -e "To manually re-attach go to $jupyterurl" $xdg_open $jupyterurl &>/dev/null exit 0 elif [ $VNC -ne 0 ]; then [[ $QUIET -eq 0 ]] && echo -e "Attaching to vnc session..." sleep 5 $SSH -x $HOST "/usr/local/bin/ecscancel --signal=USR1 -b $jobid" sleep 5 xdg_open=$(which xdg-open > /dev/null && echo xdg-open || echo open) vnchost=$node vncport=$(($($SSH -x $HOST "id -u")+5900)) if [[ -z $ECPLATFORM ]]; then if [[ -z $(ps x| grep -- -L$vncport | grep -v grep | awk '{print $1}') ]]; then [[ $QUIET -eq 0 ]] && echo -e "Opening SSH tunnel for VNC..." $SSH -x -N -f -L$vncport:$node:$vncport $node fi vnchost="localhost" fi if which vncviewer >/dev/null ;then [[ $QUIET -eq 0 ]] && echo -e "To manually re-attach:\n\tvncviewer -passwd ~/.vnc/$VNC_PASSWORD_FILE $vnchost:$vncport" nohup vncviewer -passwd ~/.vnc/$VNC_PASSWORD_FILE $vnchost:$vncport &>/dev/null & else [[ $QUIET -eq 0 ]] && echo -e "To manually re-attach, connect to vnc://$vnchost:$vncport" $xdg_open vnc://$vnchost:$vncport fi exit 0 fi fi } #=== FUNCTION ================================================================ # NAME: check_vnc_password # DESCRIPTION: Checks if the user has ~/.vnc/ecmwf_passwd. If not, it will # create one automatically and sync with client if needed #=============================================================================== function check_vnc_password () { $SSH -x $HOST "[ ! -f ~/.vnc/$VNC_PASSWORD_FILE ] && mkdir -p ~/.vnc && vncpasswd -f <<< \$(dd if=/dev/random bs=8 count=1 | base64 | head -c 18) > ~/.vnc/$VNC_PASSWORD_FILE" mkdir -p ~/.vnc rsync -q -e "$SSH -x" -a $HOST:.vnc/$VNC_PASSWORD_FILE ~/.vnc/$VNC_PASSWORD_FILE } #=== FUNCTION ================================================================ # NAME: cancel_jobs # DESCRIPTION: Cancels the interactive jobs #=============================================================================== function cancel_jobs() { query_job if [[ -n "$jobid" ]]; then [[ $QUIET -eq 0 ]] && echo "cancelling job $jobid..." [[ $QUIET -eq 0 ]] && print_queue_status $SSH -x -t $HOST "/usr/local/bin/ecscancel -b -i $jobid" else [[ $QUIET -eq 0 ]] && echo "No ${JOBTYPE} jobs found to cancel on $PLATFORM" fi exit 0 } #=== FUNCTION ================================================================ # NAME: query_job # DESCRIPTION: Checks if there is any interactive job running #=============================================================================== function query_job() { squeue=$($SSH -x $HOST "/usr/local/bin/ecsqueue -u \$USER -n $NAME -q $QOS -O cluster:8,jobid:12,state:12,batchhost:10,timelimit:.11,timeleft:.11,maxcpus:.6,minmemory:.8,tres-per-node:.16$SQUEUE_GPU_TRES") if [[ $? -ne 0 ]]; then echo -e "ERROR: could not get job state for user $USER" >&2 exit 1 fi jobline=($(echo -e "$squeue" | sed "1 d")) if [ "$jobline" != "" ]; then cluster=${jobline[0]} jobid=${jobline[1]} state=${jobline[2]} node=${jobline[3]} timelimit=${jobline[4]} timeleft=${jobline[5]} numcpus=${jobline[6]} memory=${jobline[7]} memory=${memory//[ gbGBmM]/} tmpsize=${jobline[8]} gpus=${jobline[9]} [[ -n "$node" ]] && \ [[ -f ~/.ssh/config ]] && \ grep -q -e "^[[:blank:]]*Include[[:blank:]]\+ecinteractive_ssh_config" ~/.ssh/config && \ cat > ~/.ssh/ecinteractive_ssh_config <&2 usage; exit 1 ;; esac # --- end of case --- done shift $((OPTIND-1)) TO_RUN="$@" if [[ ! -z "$ECUSER" ]]; then SSH="$SSH -l $ECUSER" USER=$ECUSER fi NAME=$USER-ecinteractive HOST=$PLATFORM-batch if [[ "$PLATFORM" == "ecs"* ]]; then QOS=ei fi if [[ "$PLATFORM" != "$LOCAL" ]]; then PLATARG="-p $PLATFORM " fi if [[ "$PLATFORM" != "ecs" && "$PLATFORM" != "hpc" ]]; then echo "ERROR: Platform not valid. Choose hpc, ecs" >&2 exit 1 fi if [[ $NGPUS -gt 0 ]]; then NAME=$USER-ecinteractive-g GPU="#SBATCH -G $NGPUS" QOS=ng PLATARG="-g " SQUEUE_GPU_TRES=",tres-per-job:.15" JOBTYPE="GPU-enabled interactive" if [[ $CUSTOM_TIME -eq 0 ]]; then TIME="02:00:00" elif [[ $(( $QUERY + $KILL + $UPDATE)) -eq 0 && $(to_secs $TIME) -gt 43200 ]]; then echo "WARNING: capping time limit to 12 h" TIME="12:00:00" fi else JOBTYPE="interactive" GPU="" SQUEUE_GPU_TRES="" fi if [[ -z "$ECPLATFORM" ]]; then if [[ $VNC -ne 0 || $JUPYTER -ne 0 ]]; then [[ $QUIET -eq 0 ]] && echo "WARNING: Running outside ECMWF premises. Support for VNC/Jupyter limited" >&2 fi fi if [[ -n "$QUERY" ]]; then get_motd query_job if [[ -n "$jobid" ]]; then print_queue_status else echo "No ${JOBTYPE} jobs found on $PLATFORM" fi exit 0 fi if [[ -n "$KILL" ]]; then cancel_jobs fi if [ $VNC -ne 0 ]; then if [[ -n $DISPLAY ]]; then : else echo "DISPLAY is not set. You are not running from a graphical session" >&2 exit 1 fi check_vnc_password fi if [[ -n $PORT_FORWARD ]]; then _to_forward="" for fport in $(echo $PORT_FORWARD | tr "," " "); do _to_forward="$_to_forward -L$fport:localhost:$fport" done PORT_FORWARD=$_to_forward fi get_motd check_running_job submit_job check_running_job 5 exit 0 #{% endraw %}