#!/bin/bash
# Version 07.2019

#################### Job Settings #################################################################
# Commands for the workload manager SLURM are lines beginning with "#SBATCH"
#SBATCH -J tst                # Display name of the submission
#SBATCH -N 4                  # Number of nodes to reserve; use -N 2-5 for a variable node count
#SBATCH --ntasks-per-node 16  # typically 16, range: 1..16 (max 16 cores per node)
#SBATCH -t 001:00:00          # Walltime, format: hhh:mm:ss, days-hh, or days-hh:mm:ss
#SBATCH -p short              # Desired partition; alternatively comment this line out and submit the script with 'sbatch -p big jobscript.sh'
#SBATCH --mem 120000          # Memory limit per node in MB

#################### Simulation Settings ##########################################################
## Work directory. No "/" at the end.
WORKDIR="/scratch/tmp/$USER/my_sim_dir"
## Simulation file (located in the work directory)
SIMULATIONFILE="star.sim"
## Macro file (located in the work directory)
MACROFILE="macro.java"
## Personal POD key
PERSONAL_PODKEY="XXXXXXXXXXXXXXXXXXXXXX"
## Select the StarCCM+ version by uncommenting the desired module (and commenting out the others).
#module load starCCM/11.06.011
#module load starCCM/12.02.011
module load starCCM/13.02.013
## Application. Can be kept unchanged if modules are used.
APPLICATION="starccm+"
## Options for running with a macro
OPTIONS="$WORKDIR/$SIMULATIONFILE -batch $WORKDIR/$MACROFILE -licpath 1999@flex.cd-adapco.com -power -podkey $PERSONAL_PODKEY -collab -time -rsh /usr/bin/ssh"
## Options for a plain run without a macro (uncomment to use instead)
#OPTIONS="$WORKDIR/$SIMULATIONFILE -batch run -licpath 1999@flex.cd-adapco.com -power -podkey $PERSONAL_PODKEY -collab -time -rsh /usr/bin/ssh"

#################### Printing some Debug Information ##############################################
# simplify debugging:
echo "SLURM_JOB_NODELIST=$SLURM_JOB_NODELIST"
echo "SLURM_NNODES=$SLURM_NNODES SLURM_TASKS_PER_NODE=$SLURM_TASKS_PER_NODE"
env | grep -e MPI -e SLURM
echo "host=$(hostname) pwd=$(pwd) ulimit=$(ulimit -v) \$1=$1 \$2=$2"
exec 2>&1 # redirect stderr into the stdout stream
# list the loaded modules
echo "Loaded Modules: $LOADEDMODULES"

## Change into the work directory
cd "$WORKDIR"; echo "pwd=$(pwd)"

# export OMP_WAIT_POLICY="PASSIVE"
# Threads per task = 16 cores divided by the number of tasks per node (rounded up)
export OMP_NUM_THREADS=$((16/((SLURM_NPROCS+SLURM_NNODES-1)/SLURM_NNODES)))
[ "$OMP_NUM_THREADS" -eq 16 ] && export GOMP_CPU_AFFINITY="0-15:1" # task-specific
export OMP_PROC_BIND=TRUE
echo "OMP_NUM_THREADS=$OMP_NUM_THREADS"
# For small jobs (fewer than 4 nodes), report which node each task runs on
[ "$SLURM_NNODES" ] && [ "$SLURM_NNODES" -lt 4 ] && srun bash -c "echo task \$SLURM_PROCID of \$SLURM_NPROCS runs on \$SLURMD_NODENAME"

#################### Preparing the Simulation #####################################################
## Create the machinefile and a temporary machinefile in the work directory
MACHINEFILE="machinefile.$SLURM_JOBID.txt"
TEMPMACHINEFILE="machinefile.temp.$SLURM_JOBID.txt"
scontrol show hostnames $SLURM_JOB_NODELIST > $WORKDIR/$TEMPMACHINEFILE
touch $WORKDIR/$MACHINEFILE

## Check the reserved nodes for running processes; nodes with leftover user processes are removed from the machinefile.
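## Note (illustrative, an assumption rather than part of the original procedure): the loop
## below logs in to each reserved node via ssh and filters the "ps aux" output; a node
## counts as clean only if nothing remains after dropping system accounts and the helper
## processes of the check itself. A quick manual spot check on a single node would be,
## for example:
##   ssh <nodename> ps -u "$USER" -o pid,comm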
echo "Checking whether servers are clean (of user processes)" for s in $(cat $WORKDIR/$TEMPMACHINEFILE) do serverload=$(timeout 10s ssh $s ps aux | \ tail -n +2 | \ sed '/sshd/d;/ps aux/d;/slurm_script/d;/cut/d;/sort/d;/sed/d;/tail/d;/uniq/d' | \ cut -d' ' -f1 | \ sort| \ sed '/root/d;/munge/d;/dbus/d;/ldap/d;/nslcd/d;/postfix/d;/ntp/d;/nscd/d' | \ uniq -c ) if [ -n "$serverload" ] ; then echo "=== ERROR === On server ${s}, user processes are running: \"${serverload}\"" else echo "${s} seems clear." echo "${s}" >> $WORKDIR/$MACHINEFILE fi done rm $WORKDIR/$TEMPMACHINEFILE ## Calculating remaining NPROCS CLEAN_NNODES=$(cat $WORKDIR/$MACHINEFILE | wc -l) CLEAN_NPROCS=$(( $CLEAN_NNODES * $SLURM_NTASKS_PER_NODE )) echo "Finally Running on $CLEAN_NPROCS processes on $CLEAN_NNODES servers." #################### Running the simulation ####################################################### ## Let StarCCM+ wait for licenses on startup export STARWAIT=1 ## Start time stamp date +%Y-%m-%d_%H:%M:%S_%s_%Z # date as YYYY-MM-DD_HH:MM:SS_Ww_ZZZ ## Command to run application (StarCCM+) echo "Now, running the simulation ...." $APPLICATION $OPTIONS -np $CLEAN_NPROCS -machinefile $WORKDIR/$MACHINEFILE > $WORKDIR/$SIMULATIONFILE.$SLURM_JOBID.output.log 2>&1 ## Final time stamp date +%Y-%m-%d_%H:%M:%S_%s_%Z #################### Brute Force Clean-Up ######################################################### ## This part kills ALL starccm processes (of your account in this job) after a run. ## Though, it will only run, if starccm is not killed by SLRUM before... ## ## Therefore, MAKE SURE starccm stops itself before the job is killed (see LSS wiki) ## ## Probably, this part has become obsolete. echo "Start Brute-Force clean up" [ "$UID" != "0" ] && srun --ntasks-per-node=1 bash -c \ "find /dev/shm /tmp -xdev -mindepth 1 -maxdepth 1 -user $USER \ -exec rm -rf --one-file-system {} \;" [ "$UID" != "0" ] && srun --ntasks-per-node=1 bash -c \ "pkill -u $UID -9 star " 2>/dev/null [ "$UID" != "0" ] && srun --ntasks-per-node=1 bash -c \ "pkill -u $UID -9 mpid " 2>/dev/null echo "done."